OpenDAS / ollama · Commits

Commit 20c5fd39 (Unverified)
Authored May 08, 2025 by Devon Rifkin, committed by GitHub on May 08, 2025

    Merge branch 'main' into drifkin/array-head-count-simple

Parents: d2ee599d, 6e9a7a25
Changes: 156

Showing 20 changed files with 264 additions and 251 deletions (+264 −251)
Files in this view:

  llama/llama.cpp/src/llama-chat.h                                              +3   −2
  llama/llama.cpp/src/llama-context.cpp                                         +4   −17
  llama/llama.cpp/src/llama-context.h                                           +1   −2
  llama/llama.cpp/src/llama-graph.cpp                                           +42  −16
  llama/llama.cpp/src/llama-graph.h                                             +5   −7
  llama/llama.cpp/src/llama-hparams.h                                           +1   −0
  llama/llama.cpp/src/llama-model.cpp                                           +59  −13
  llama/llama.cpp/src/llama-model.h                                             +6   −2
  llama/llama.cpp/src/llama-quant.cpp                                           +0   −4
  llama/llama.cpp/src/llama-sampling.cpp                                        +2   −1
  llama/llama.cpp/src/llama-vocab.cpp                                           +2   −1
  llama/llama.go                                                                +1   −26
  llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch +12  −12
  llama/patches/0002-pretokenizer.patch                                         +2   −2
  llama/patches/0003-embeddings.patch                                           +4   −4
  llama/patches/0004-clip-unicode.patch                                         +5   −5
  llama/patches/0005-solar-pro.patch                                            +21  −21
  llama/patches/0006-add-mllama-support.patch                                   +65  −87
  llama/patches/0007-add-unpad-operator.patch                                   +28  −28
  llama/patches/0008-fix-deepseek-deseret-regex.patch                           +1   −1
llama/llama.cpp/src/llama-chat.h

@@ -29,8 +29,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_DEEPSEEK_3,
     LLM_CHAT_TEMPLATE_COMMAND_R,
     LLM_CHAT_TEMPLATE_LLAMA_3,
-    LLM_CHAT_TEMPLATE_CHATGML_3,
-    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_CHATGLM_3,
+    LLM_CHAT_TEMPLATE_CHATGLM_4,
     LLM_CHAT_TEMPLATE_GLMEDGE,
     LLM_CHAT_TEMPLATE_MINICPM,
     LLM_CHAT_TEMPLATE_EXAONE_3,

@@ -41,6 +41,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_YANDEX,
     LLM_CHAT_TEMPLATE_BAILING,
     LLM_CHAT_TEMPLATE_LLAMA4,
+    LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
llama/llama.cpp/src/llama-context.cpp

@@ -114,7 +114,7 @@ llama_context::llama_context(
     }

     if (n_ctx_per_seq > hparams.n_ctx_train) {
-        LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }

@@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift(
         ggml_tensor * shift,
         ggml_tensor * factors,
               float   freq_base,
-              float   freq_scale,
-        ggml_backend_buffer * bbuf) const {
+              float   freq_scale) const {
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

     const auto & yarn_ext_factor = cparams.yarn_ext_factor;

@@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift(
         // dequantize to f32 -> RoPE -> quantize back
         tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);

-        if (bbuf) {
-            for (const auto & backend : backends) {
-                // Figure out which backend KV cache belongs to
-                if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                    ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
-                    break;
-                }
-            }
-        }
-
-        tmp = ggml_rope_ext_inplace(ctx0, tmp,
+        tmp = ggml_rope_ext(ctx0, tmp,
                 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);

@@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
                 ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
                 0);

-        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);

         ggml_build_forward_expand(gf, cur);
     }

@@ -1510,8 +1499,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
     // set all ids as invalid (negative)
     std::fill(output_ids.begin(), output_ids.end(), -1);

-    ggml_backend_buffer_clear(buf_output.get(), 0);
-
     this->n_outputs     = 0;
     this->n_outputs_max = n_outputs_max;
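Aside on the KV-shift change above: build_rope_shift no longer pins its temporary tensor to the KV buffer's backend; it simply re-applies RoPE out of place and lets the scheduler place the tensor. Conceptually a shift works because RoPE is a rotation whose angle is linear in the token position, so re-rotating by the position delta is the same as rotating directly to the new position. The sketch below is a standalone illustration of that property only (it is not ollama or llama.cpp code; the frequency value is an arbitrary assumption).

    #include <cmath>
    #include <cstdio>

    // Rotate one (x0, x1) pair the way RoPE would for a given absolute position.
    static void rope_pair(float & x0, float & x1, int pos, float theta) {
        const float a = pos * theta;
        const float c = std::cos(a), s = std::sin(a);
        const float r0 = x0 * c - x1 * s;
        const float r1 = x0 * s + x1 * c;
        x0 = r0;
        x1 = r1;
    }

    int main() {
        const float theta = 0.01f;          // per-dimension frequency (assumed)

        float k0 = 1.0f, k1 = 0.0f;         // a cached key pair
        rope_pair(k0, k1, /*pos=*/7, theta);    // originally rotated to position 7
        rope_pair(k0, k1, /*shift=*/-3, theta); // shift the cache back by 3 positions
        std::printf("after shift : %.4f %.4f\n", k0, k1);

        float r0 = 1.0f, r1 = 0.0f;
        rope_pair(r0, r1, /*pos=*/4, theta);    // same pair rotated directly to position 4
        std::printf("direct pos 4: %.4f %.4f\n", r0, r1);
        return 0;                                // both prints match
    }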
llama/llama.cpp/src/llama-context.h

@@ -172,8 +172,7 @@ private:
             ggml_tensor * shift,
             ggml_tensor * factors,
                   float   freq_base,
-                  float   freq_scale,
-            ggml_backend_buffer * bbuf) const;
+                  float   freq_scale) const;

     llm_graph_result_ptr build_kv_self_shift(
             ggml_context * ctx0,
llama/llama.cpp/src/llama-graph.cpp

@@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;

-        ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
+        if (ubatch->token && n_pos_per_embd == 4) {
+            // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+            // the 3 first dims are the same, and 4th dim is all 0
+            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
+            // copy the first dimension
+            for (int i = 0; i < n_tokens; ++i) {
+                pos_data[               i] = ubatch->pos[i];
+                pos_data[    n_tokens + i] = ubatch->pos[i];
+                pos_data[2 * n_tokens + i] = ubatch->pos[i];
+                pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
+            }
+            ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
+        } else {
+            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+        }
     }
 }

@@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
             ) * f_attn_temp_scale + 1.0;
         }

-        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
     }
 }

@@ -598,7 +612,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     res              (std::make_unique<llm_graph_result>()) {
     }

-int64_t llm_graph_context::n_pos_per_token() const {
+int64_t llm_graph_context::n_pos_per_embd() const {
     return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }

@@ -809,6 +823,10 @@ ggml_tensor * llm_graph_context::build_ffn(
     if (down) {
         cur = build_lora_mm(down, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }

     if (down_b) {

@@ -916,28 +934,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);

-    ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(gate, "ffn_moe_gate", il);
+    ggml_tensor * experts = nullptr;
+    if (gate_exps) {
+        cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate", il);
+    } else {
+        cur = up;
+    }

     switch (type_op) {
         case LLM_FFN_SILU:
             {
-                gate = ggml_silu(ctx0, gate);
-                cb(gate, "ffn_moe_silu", il);
+                cur = ggml_silu(ctx0, cur);
+                cb(cur, "ffn_moe_silu", il);
             } break;
         case LLM_FFN_GELU:
             {
-                gate = ggml_gelu(ctx0, gate);
-                cb(gate, "ffn_moe_gelu", il);
+                cur = ggml_gelu(ctx0, cur);
+                cb(cur, "ffn_moe_gelu", il);
             } break;
         default:
             GGML_ABORT("fatal error");
     }

-    ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
-    cb(par, "ffn_moe_gate_par", il);
+    if (gate_exps) {
+        cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate_par", il);
+    }

-    ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);

     if (!weight_before_ffn) {

@@ -1020,11 +1045,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 }

 ggml_tensor * llm_graph_context::build_inp_pos() const {
-    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
+    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());

     auto & cur = inp->pos;

-    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
+    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
     ggml_set_input(cur);

     res->add_input(std::move(inp));

@@ -1033,11 +1058,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }

 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);

     auto & cur = inp->attn_scale;

-    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    // this need to be 1x1xN for broadcasting
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
     ggml_set_input(cur);

     res->add_input(std::move(inp));
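Aside on the M-RoPE position conversion in llm_graph_input_pos::set_input above: for text tokens with n_pos_per_embd == 4, the single position stream is expanded into four sections laid out back to back, the first three repeating the 1-D position and the fourth held at zero. A minimal, standalone sketch of that layout (not the library code itself; the token count and position values are assumptions chosen for the example):

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_tokens       = 3;
        const int n_pos_per_embd = 4;
        std::vector<int> pos = {10, 11, 12};             // 1-D positions from the batch

        std::vector<int> pos_data(n_tokens * n_pos_per_embd);
        for (int i = 0; i < n_tokens; ++i) {
            pos_data[0 * n_tokens + i] = pos[i];
            pos_data[1 * n_tokens + i] = pos[i];
            pos_data[2 * n_tokens + i] = pos[i];
            pos_data[3 * n_tokens + i] = 0;              // 4th dimension is always 0 for text
        }

        // prints three rows of "10 11 12" followed by "0 0 0"
        for (int d = 0; d < n_pos_per_embd; ++d) {
            for (int i = 0; i < n_tokens; ++i) {
                std::printf("%d ", pos_data[d * n_tokens + i]);
            }
            std::printf("\n");
        }
        return 0;
    }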
llama/llama.cpp/src/llama-graph.h

@@ -91,29 +91,27 @@ public:
 class llm_graph_input_pos : public llm_graph_input_i {
 public:
-    llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
+    llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
     virtual ~llm_graph_input_pos() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * pos = nullptr; // I32 [n_batch]

-    const int64_t n_pos_per_token = 1;
+    const int64_t n_pos_per_embd = 1;
 };

 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
     virtual ~llm_graph_input_attn_temp() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * attn_scale = nullptr; // F32 [n_batch]

-    const int64_t n_pos_per_token = 1;
-
     const uint32_t n_attn_temp_floor_scale;
     const float    f_attn_temp_scale;
 };

@@ -430,7 +428,7 @@ struct llm_graph_context {
     llm_graph_context(const llm_graph_params & params);

-    int64_t n_pos_per_token() const;
+    int64_t n_pos_per_embd() const;

     void cb(ggml_tensor * cur, const char * name, int il) const;
llama/llama.cpp/src/llama-hparams.h

@@ -72,6 +72,7 @@ struct llama_hparams {
     float    expert_weights_scale = 0.0;
     bool     expert_weights_norm  = false;
     uint32_t expert_gating_func   = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+    uint32_t moe_every_n_layers   = 0;

     float f_norm_eps;
     float f_norm_rms_eps;
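Aside on the new moe_every_n_layers hyperparameter: as used later in this commit (llama-model.cpp), layer i gets the MoE feed-forward when moe_every_n_layers > 0 && i % moe_every_n_layers == 1, and the dense feed-forward otherwise. A standalone illustration of that schedule (the layer count and period are assumed values for the example, not taken from any specific model):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t n_layer            = 12;
        const uint32_t moe_every_n_layers = 2;   // e.g. an interleaved MoE/dense stack (assumed)

        for (uint32_t i = 0; i < n_layer; ++i) {
            const bool is_moe = moe_every_n_layers > 0 && i % moe_every_n_layers == 1;
            std::printf("layer %2u -> %s\n", i, is_moe ? "MoE FFN" : "dense FFN");
        }
        return 0;   // odd-numbered layers print "MoE FFN", even ones "dense FFN"
    }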
llama/llama.cpp/src/llama-model.cpp

@@ -43,11 +43,13 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_770M:           return "770M";
         case LLM_TYPE_780M:           return "780M";
         case LLM_TYPE_0_5B:           return "0.5B";
+        case LLM_TYPE_0_6B:           return "0.6B";
         case LLM_TYPE_1B:             return "1B";
         case LLM_TYPE_1_3B:           return "1.3B";
         case LLM_TYPE_1_4B:           return "1.4B";
         case LLM_TYPE_1_5B:           return "1.5B";
         case LLM_TYPE_1_6B:           return "1.6B";
+        case LLM_TYPE_1_7B:           return "1.7B";
         case LLM_TYPE_1_8B:           return "1.8B";
         case LLM_TYPE_2B:             return "2B";
         case LLM_TYPE_2_8B:           return "2.8B";

@@ -66,6 +68,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_15B:            return "15B";
         case LLM_TYPE_16B:            return "16B";
         case LLM_TYPE_20B:            return "20B";
+        case LLM_TYPE_27B:            return "27B";
         case LLM_TYPE_30B:            return "30B";
         case LLM_TYPE_32B:            return "32B";
         case LLM_TYPE_34B:            return "34B";

@@ -74,6 +77,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_65B:            return "65B";
         case LLM_TYPE_70B:            return "70B";
         case LLM_TYPE_236B:           return "236B";
+        case LLM_TYPE_290B:           return "290B";
         case LLM_TYPE_314B:           return "314B";
         case LLM_TYPE_671B:           return "671B";
         case LLM_TYPE_SMALL:          return "0.1B";

@@ -88,10 +92,10 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_16x3_8B:        return "16x3.8B";
         case LLM_TYPE_10B_128x3_66B:  return "10B+128x3.66B";
         case LLM_TYPE_57B_A14B:       return "57B.A14B";
-        case LLM_TYPE_27B:            return "27B";
-        case LLM_TYPE_290B:           return "290B";
         case LLM_TYPE_17B_16E:        return "17Bx16E (Scout)";
         case LLM_TYPE_17B_128E:       return "17Bx128E (Maverick)";
+        case LLM_TYPE_30B_A3B:        return "30B.A3B";
+        case LLM_TYPE_235B_A22B:      return "235B.A22B";
         default:                      return "?B";
     }
 }

@@ -709,10 +713,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL,        hparams.causal_attn);
                 ml.get_key(LLM_KV_POOLING_TYPE,            hparams.pooling_type);
+                ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,      hparams.moe_every_n_layers, 0);

                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
                     type = LLM_TYPE_137M;

@@ -805,6 +811,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
+                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
+                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+                    case 40: type = LLM_TYPE_14B; break;
+                    case 64: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;

@@ -814,6 +824,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
+                    case 48: type = LLM_TYPE_30B_A3B; break;
+                    case 94: type = LLM_TYPE_235B_A22B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;

@@ -1425,7 +1437,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_MISTRAL3: break;
         default: throw std::runtime_error("unsupported model architecture");
     }

@@ -2133,6 +2144,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
             {
                 tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
                 type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);

@@ -2166,20 +2178,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                     }

+                    if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+                    }
+
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                     layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
                     layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias",   i), {n_embd}, 0);

-                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-
-                    if (arch == LLM_ARCH_BERT) {
+                    if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
                         layer.bo            = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, 0);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), { n_embd, n_ff, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
                     } else {
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+
+                        if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                            layer.bo         = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, 0);
+                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+                        } else {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        }
                     }

                     layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);

@@ -6074,6 +6097,11 @@ struct llm_build_bert : public llm_graph_context {
                     cur = build_lora_mm(model.layers[il].wqkv, cur);
                     cb(cur, "wqkv", il);

+                    if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                        cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                        cb(cur, "bqkv", il);
+                    }
+
                     Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
                     Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
                     Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

@@ -6126,13 +6154,29 @@ struct llm_build_bert : public llm_graph_context {
         cb(ffn_inp, "ffn_inp", il);

         // feed-forward network
-        if (model.arch == LLM_ARCH_BERT) {
+        if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
+            // MoE branch
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    nullptr,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    hparams.n_expert,
+                    hparams.n_expert_used,
+                    LLM_FFN_GELU,
+                    false, false,
+                    0.0f,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+            cb(cur, "ffn_moe_out", il);
+        } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
                     NULL,                      NULL,                        NULL,
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                     NULL,
                     LLM_FFN_GELU, LLM_FFN_SEQ, il);
+            cb(cur, "ffn_out", il);
         } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up, NULL, NULL,

@@ -6140,6 +6184,7 @@ struct llm_build_bert : public llm_graph_context {
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                     NULL,
                     LLM_FFN_GELU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
         } else {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up, NULL, NULL,

@@ -6147,8 +6192,8 @@ struct llm_build_bert : public llm_graph_context {
                     model.layers[il].ffn_down, NULL, NULL,
                     NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
         }
-        cb(cur, "ffn_out", il);

         // attentions bypass the intermediate layer
         cur = ggml_add(ctx0, cur, ffn_inp);

@@ -13349,6 +13394,7 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
             {
                 llm = std::make_unique<llm_build_bert>(*this, params, gf);
             } break;

@@ -13705,7 +13751,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_SOLAR:
         case LLM_ARCH_BAILINGMOE:
-        case LLM_ARCH_MISTRAL3:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2

@@ -13714,6 +13759,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DBRX:
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_STABLELM:
         case LLM_ARCH_BITNET:
         case LLM_ARCH_QWEN:
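Aside on the MoE feed-forward call above: the new NOMIC_BERT_MOE graph passes nullptr for the gate experts, which is the gateless path build_moe_ffn now supports; with gate experts present the expert FFN stays SwiGLU-style (act(gate(x)) * up(x)), without them it is just act(up(x)). The sketch below shows only that elementwise difference on plain vectors; it is an illustration with assumed input values, not graph code.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // tanh-approximation GELU and SiLU, as commonly used in these FFNs
    static float gelu(float x) { return 0.5f * x * (1.0f + std::tanh(0.7978845608f * (x + 0.044715f * x * x * x))); }
    static float silu(float x) { return x / (1.0f + std::exp(-x)); }

    int main() {
        std::vector<float> up   = {0.5f, -1.0f, 2.0f};   // up-projection of one token (assumed)
        std::vector<float> gate = {1.0f,  0.3f, -0.2f};  // gate-projection of the same token (assumed)

        for (size_t i = 0; i < up.size(); ++i) {
            const float with_gate    = silu(gate[i]) * up[i];  // gate experts present
            const float without_gate = gelu(up[i]);            // gate experts == nullptr (GELU path)
            std::printf("%zu: gated=%.4f  gateless=%.4f\n", i, with_gate, without_gate);
        }
        return 0;
    }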
llama/llama.cpp/src/llama-model.h

@@ -40,11 +40,13 @@ enum llm_type {
     LLM_TYPE_770M,
     LLM_TYPE_780M,
     LLM_TYPE_0_5B,
+    LLM_TYPE_0_6B,
     LLM_TYPE_1B,
     LLM_TYPE_1_3B,
     LLM_TYPE_1_4B,
     LLM_TYPE_1_5B,
     LLM_TYPE_1_6B,
+    LLM_TYPE_1_7B,
     LLM_TYPE_1_8B,
     LLM_TYPE_2B,
     LLM_TYPE_2_8B,

@@ -64,6 +66,7 @@ enum llm_type {
     LLM_TYPE_16B,
     LLM_TYPE_20B,
     LLM_TYPE_22B,
+    LLM_TYPE_27B,
     LLM_TYPE_30B,
     LLM_TYPE_32B,
     LLM_TYPE_34B,

@@ -73,6 +76,7 @@ enum llm_type {
     LLM_TYPE_70B,
     LLM_TYPE_90B,
     LLM_TYPE_236B,
+    LLM_TYPE_290B,
     LLM_TYPE_314B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,

@@ -87,10 +91,10 @@ enum llm_type {
     LLM_TYPE_16x3_8B,
     LLM_TYPE_10B_128x3_66B,
     LLM_TYPE_57B_A14B,
-    LLM_TYPE_27B,
-    LLM_TYPE_290B,
     LLM_TYPE_17B_16E,  // llama4 Scout
     LLM_TYPE_17B_128E, // llama4 Maverick
+    LLM_TYPE_30B_A3B,
+    LLM_TYPE_235B_A22B,
 };

 struct llama_layer_posnet {
llama/llama.cpp/src/llama-quant.cpp

@@ -744,10 +744,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
         // This used to be a regex, but <regex> has an extreme cost to compile times.
         bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

-        // don't quantize vision stuff
-        quantize &= name.find("v.") == std::string::npos;
-        quantize &= name.find("mm.") == std::string::npos;
-
         // quantize only 2D and 3D tensors (experts)
         quantize &= (ggml_n_dims(tensor) >= 2);
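Aside on the filter left behind after this removal: a tensor is now considered for quantization simply when its name ends in "weight" and it has at least two dimensions; the explicit "v." / "mm." vision exclusions are gone. A standalone sketch of that predicate (illustration only, using a hypothetical should_quantize helper rather than the actual function):

    #include <cstdio>
    #include <string>

    static bool should_quantize(const std::string & name, int n_dims) {
        // ends with 'weight'? (same rfind trick as the vendored code)
        bool quantize = name.rfind("weight") == name.size() - 6;
        // quantize only 2D and 3D tensors (experts)
        quantize &= n_dims >= 2;
        return quantize;
    }

    int main() {
        std::printf("%d\n", should_quantize("blk.0.ffn_up.weight",   2)); // 1: 2-D weight
        std::printf("%d\n", should_quantize("blk.0.attn_norm.weight", 1)); // 0: 1-D norm
        std::printf("%d\n", should_quantize("output.bias",            1)); // 0: not a weight
        return 0;
    }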
llama/llama.cpp/src/llama-sampling.cpp

@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
     // }

     if (k <= 0) {
-        k = cur_p->size;
+        return;
     }

     k = std::min(k, (int) cur_p->size);

@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
         }
         cur_p->sorted = true;
     }
+
     cur_p->size = k;
 }
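Aside on the top-k change above: a non-positive k now means "keep every candidate" via an early return, instead of being clamped to the full candidate count (which previously forced sorting the whole list). A standalone sketch of that behaviour on a plain vector, not the sampler code itself (the helper name and values are assumptions for the example):

    #include <algorithm>
    #include <cstdio>
    #include <functional>
    #include <vector>

    static void top_k(std::vector<float> & logits, int k) {
        if (k <= 0) {
            return;                               // keep all candidates untouched
        }
        k = std::min<int>(k, (int) logits.size());
        // keep only the k largest logits, highest first
        std::partial_sort(logits.begin(), logits.begin() + k, logits.end(), std::greater<float>());
        logits.resize(k);                         // analogous to cur_p->size = k
    }

    int main() {
        std::vector<float> a = {0.1f, 2.0f, -1.0f, 0.7f};
        top_k(a, 2);                              // -> {2.0, 0.7}
        std::vector<float> b = {0.1f, 2.0f, -1.0f, 0.7f};
        top_k(b, 0);                              // unchanged
        std::printf("a: %zu candidates, b: %zu candidates\n", a.size(), b.size());
        return 0;
    }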
llama/llama.cpp/src/llama-vocab.cpp

@@ -1497,7 +1497,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "llama3"   ||
                 tokenizer_pre == "llama-v3" ||
                 tokenizer_pre == "llama-bpe"||
-                tokenizer_pre == "falcon3") {
+                tokenizer_pre == "falcon3"  ||
+                tokenizer_pre == "pixtral") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
             ignore_merges = true;
             add_bos = true;
llama/llama.go

@@ -2,6 +2,7 @@ package llama
 /*
 #cgo CFLAGS: -std=c11
+#cgo windows CFLAGS: -Wno-dll-attribute-on-redeclaration
 #cgo CXXFLAGS: -std=c++17
 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/include
 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/common

@@ -198,7 +199,6 @@ type ModelParams struct {
     NumGpuLayers int
     MainGpu      int
     UseMmap      bool
-    UseMlock     bool
     TensorSplit  []float32
     Progress     func(float32)
     VocabOnly    bool

@@ -217,7 +217,6 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
     cparams.n_gpu_layers = C.int(params.NumGpuLayers)
     cparams.main_gpu = C.int32_t(params.MainGpu)
     cparams.use_mmap = C.bool(params.UseMmap)
-    cparams.use_mlock = C.bool(params.UseMlock)
     cparams.vocab_only = C.bool(params.VocabOnly)

     if len(params.TensorSplit) > 0 {

@@ -461,24 +460,6 @@ func (m *Model) NEmbd() int {
     return int(C.llama_model_n_embd(m.c))
 }

-func Quantize(infile, outfile string, ftype uint32) error {
-    cinfile := C.CString(infile)
-    defer C.free(unsafe.Pointer(cinfile))
-
-    coutfile := C.CString(outfile)
-    defer C.free(unsafe.Pointer(coutfile))
-
-    params := C.llama_model_quantize_default_params()
-    params.nthread = -1
-    params.ftype = ftype
-
-    if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
-        return fmt.Errorf("llama_model_quantize: %d", rc)
-    }
-
-    return nil
-}
-
 // vision processing
 type ClipContext struct {
     c *C.struct_clip_ctx

@@ -606,9 +587,6 @@ type SamplingParams struct {
     PenaltyRepeat  float32
     PenaltyFreq    float32
     PenaltyPresent float32
-    Mirostat       int
-    MirostatTau    float32
-    MirostatEta    float32
     PenalizeNl     bool
     Seed           uint32
     Grammar        string

@@ -625,9 +603,6 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext, error) {
     cparams.penalty_repeat = C.float(params.PenaltyRepeat)
     cparams.penalty_freq = C.float(params.PenaltyFreq)
     cparams.penalty_present = C.float(params.PenaltyFreq)
-    cparams.mirostat = C.int32_t(params.Mirostat)
-    cparams.mirostat_tau = C.float(params.MirostatTau)
-    cparams.mirostat_eta = C.float(params.MirostatEta)
     cparams.seed = C.uint32_t(params.Seed)

     grammar := C.CString(params.Grammar)
llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch

@@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644
  /**
  diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index a7febef7..31750b6f 100644
+index 9fb2134f..04ce764e 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {

@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
  static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
  diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 266d8af4..12886cd3 100644
+index d92392ed..425524d0 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -5077,6 +5077,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
  }
  free(ctx);

@@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644
  static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
  diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index a0667b7d..bd83adc5 100644
+index 140a775f..e33c4ba0 100644
 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
 +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
-@@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -477,6 +477,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
  GGML_ASSERT(status);
  delete ctx;

@@ -161,10 +161,10 @@ index a0667b7d..bd83adc5 100644
  static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
  diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 1de34c96..4600f61e 100644
+index 66b6f2cc..e3e6deae 100644
 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
 +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
-@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+@@ -317,6 +317,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
  ggml_sycl_set_device(ctx->device);
  delete ctx;

@@ -172,7 +172,7 @@ index 1de34c96..4600f61e 100644
  }
  catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-@@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context {
+@@ -762,6 +763,7 @@ struct ggml_backend_sycl_split_buffer_context {
  static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
  delete ctx;

@@ -180,7 +180,7 @@ index 1de34c96..4600f61e 100644
  }
  static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
+@@ -1096,6 +1098,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
  static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_sycl_host_free(buffer->context);

@@ -189,10 +189,10 @@ index 1de34c96..4600f61e 100644
  static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 39f3cd34..c569a8a5 100644
+index c0bdb9e1..03d03064 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -8660,6 +8660,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
  ggml_vk_destroy_buffer(ctx->dev_buffer);
  delete ctx;

@@ -200,7 +200,7 @@ index 39f3cd34..c569a8a5 100644
  }
  static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -8803,6 +8804,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
  static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
  ggml_vk_host_free(vk_instance.devices[0], buffer->context);
llama/patches/0002-pretokenizer.patch

@@ -10,7 +10,7 @@ logs instead of throwing an error
  1 file changed, 3 insertions(+), 11 deletions(-)

  diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 48060517..a35b498c 100644
+index 50ded286..a9ee9f03 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
 @@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

@@ -31,7 +31,7 @@ index 48060517..a35b498c 100644
  pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  } else if (
  tokenizer_pre == "llama3" ||
-@@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1635,7 +1626,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
  clean_spaces = false;
  } else {
llama/patches/0003-embeddings.patch

@@ -11,10 +11,10 @@ instead of forcing one or the error
  1 file changed, 3 insertions(+), 3 deletions(-)

  diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 983385f8..32f59819 100644
+index 5a2eef9b..9c1fe93f 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1225,7 +1225,7 @@ int llama_context::decode(llama_batch & inp_batch) {
  int64_t n_outputs_all = 0;
  // count outputs

@@ -23,7 +23,7 @@ index 983385f8..32f59819 100644
  for (uint32_t i = 0; i < n_tokens_all; ++i) {
  n_outputs_all += batch.logits[i] != 0;
  }
-@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1337,7 +1337,7 @@ int llama_context::decode(llama_batch & inp_batch) {
  //     ggml_graph_dump_dot(gf, NULL, "llama.dot");
  //}

@@ -32,7 +32,7 @@ index 983385f8..32f59819 100644
  auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
  if (t_embd && res->get_embd_pooled()) {
-@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
+@@ -1481,7 +1481,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
  const auto n_embd = hparams.n_embd;
  // TODO: use a per-batch flag for logits presence instead
llama/patches/0004-clip-unicode.patch

@@ -10,12 +10,12 @@ filesystems for paths that include wide characters
  1 file changed, 39 insertions(+)

  diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 75970615..d57b4bd6 100644
+index ad3e7df1..b3218c78 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
-@@ -29,6 +29,19 @@
-#include <limits>
+@@ -30,6 +30,19 @@
 #include <array>
+#include <numeric>
 +#if defined(_WIN32)
 +#define WIN32_LEAN_AND_MEAN

@@ -33,7 +33,7 @@ index 75970615..d57b4bd6 100644
  struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
  //#define CLIP_DEBUG_FUNCTIONS
-@@ -1430,7 +1443,29 @@ struct clip_model_loader {
+@@ -1971,7 +1984,29 @@ struct clip_model_loader {
  {
  std::vector<uint8_t> read_buf;

@@ -63,7 +63,7 @@ index 75970615..d57b4bd6 100644
  if (!fin) {
  throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
  }
-@@ -1457,7 +1492,11 @@ struct clip_model_loader {
+@@ -1998,7 +2033,11 @@ struct clip_model_loader {
  ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
  }
  }
llama/patches/0005-solar-pro.patch
View file @
20c5fd39
...
@@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
...
@@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
7 files changed, 248 insertions(+)
7 files changed, 248 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index
62e1480b..f754bc8f
100644
index
f2bc8ca7..5ab3f572
100644
--- a/src/llama-arch.cpp
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6
8
,6 +6
8
,7 @@
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -6
9
,6 +6
9
,7 @@
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
...
@@ -26,7 +26,7 @@ index 62e1480b..f754bc8f 100644
...
@@ -26,7 +26,7 @@ index 62e1480b..f754bc8f 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -14
0
,6 +14
1
,7 @@
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -14
2
,6 +14
3
,7 @@
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
...
@@ -34,7 +34,7 @@ index 62e1480b..f754bc8f 100644
...
@@ -34,7 +34,7 @@ index 62e1480b..f754bc8f 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -1
48
2,6 +1
48
4,24 @@
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -1
50
2,6 +1
50
4,24 @@
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
},
},
...
@@ -59,7 +59,7 @@ index 62e1480b..f754bc8f 100644
...
@@ -59,7 +59,7 @@ index 62e1480b..f754bc8f 100644
{
{
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_WAVTOKENIZER_DEC,
{
{
@@ -16
6
0,6 +1
68
0,7 @@
static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -16
8
0,6 +1
70
0,7 @@
static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
...
@@ -68,10 +68,10 @@ index 62e1480b..f754bc8f 100644
...
@@ -68,10 +68,10 @@ index 62e1480b..f754bc8f 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
diff --git a/src/llama-arch.h b/src/llama-arch.h
index
98ca00a1..439aaeab
100644
index
41a023da..525c1b7d
100644
--- a/src/llama-arch.h
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -7
2
,6 +7
2
,7 @@
enum llm_arch {
@@ -7
3
,6 +7
3
,7 @@
enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
LLM_ARCH_CHAMELEON,
...
@@ -79,7 +79,7 @@ index 98ca00a1..439aaeab 100644
...
@@ -79,7 +79,7 @@ index 98ca00a1..439aaeab 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
LLM_ARCH_BAILINGMOE,
@@ -14
4
,6 +14
5
,7 @@
enum llm_kv {
@@ -14
6
,6 +14
7
,7 @@
enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_SCALE,
...
@@ -87,7 +87,7 @@ index 98ca00a1..439aaeab 100644
...
@@ -87,7 +87,7 @@ index 98ca00a1..439aaeab 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -34
4
,6 +34
6
,7 @@
enum llm_tensor {
@@ -34
6
,6 +34
8
,7 @@
enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_CLS_OUT,
...
@@ -115,7 +115,7 @@ index 90dfe7a7..8a667960 100644
...
@@ -115,7 +115,7 @@ index 90dfe7a7..8a667960 100644
if (il < n_layer) {
if (il < n_layer) {
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index
80fcd65d..6e278945
100644
index
7ee6a5b7..48dce407
100644
--- a/src/llama-hparams.h
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -55,6 +55,8 @@
struct llama_hparams {
@@ -55,6 +55,8 @@
struct llama_hparams {
...
@@ -127,7 +127,7 @@ index 80fcd65d..6e278945 100644
...
@@ -127,7 +127,7 @@ index 80fcd65d..6e278945 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
uint32_t n_lora_kv = 0;
@@ -153,6 +155,9 @@ struct llama_hparams {
@@ -154,6 +156,9 @@ struct llama_hparams {
// dimension of the recurrent state embeddings
// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;
uint32_t n_embd_v_s() const;
...
@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
...
@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
llama_model_loader::llama_model_loader(
llama_model_loader::llama_model_loader(
const std::string & fname,
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 6b7bfecf..aba42819 100644
index 822e2bb2..572378c9 100644
--- a/src/llama-model.cpp
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
default: type = LLM_TYPE_UNKNOWN;
}
}
} break;
} break;
...
@@ -175,7 +175,7 @@ index 6b7bfecf..aba42819 100644
...
@@ -175,7 +175,7 @@ index 6b7bfecf..aba42819 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
case LLM_ARCH_WAVTOKENIZER_DEC:
{
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
...
@@ -210,7 +210,7 @@ index 6b7bfecf..aba42819 100644
...
@@ -210,7 +210,7 @@ index 6b7bfecf..aba42819 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context {
@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context {
}
}
};
};
...
@@ -376,7 +376,7 @@ index 6b7bfecf..aba42819 100644
...
@@ -376,7 +376,7 @@ index 6b7bfecf..aba42819 100644
struct llm_build_wavtokenizer_dec : public llm_graph_context {
struct llm_build_wavtokenizer_dec : public llm_graph_context {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
ggml_tensor * cur;
ggml_tensor * cur;
@@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
{
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
} break;
} break;
...
@@ -387,7 +387,7 @@ index 6b7bfecf..aba42819 100644
...
@@ -387,7 +387,7 @@ index 6b7bfecf..aba42819 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
case LLM_ARCH_WAVTOKENIZER_DEC:
{
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
@@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
case LLM_ARCH_CHAMELEON:
...
@@ -396,18 +396,18 @@ index 6b7bfecf..aba42819 100644
...
@@ -396,18 +396,18 @@ index 6b7bfecf..aba42819 100644
return LLAMA_ROPE_TYPE_NORM;
return LLAMA_ROPE_TYPE_NORM;
diff --git a/src/llama-model.h b/src/llama-model.h
diff --git a/src/llama-model.h b/src/llama-model.h
index fd82d106..5865d5e9 100644
index 95eca002..856e6042 100644
--- a/src/llama-model.h
--- a/src/llama-model.h
+++ b/src/llama-model.h
+++ b/src/llama-model.h
@@ -62,6 +62,7 @@ enum llm_type {
@@ -64,6 +64,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_16B,
LLM_TYPE_20B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
+ LLM_TYPE_22B,
LLM_TYPE_27B,
LLM_TYPE_30B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -311,6 +312,8 @@ struct llama_layer {
@@ -307,6 +308,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
...
...
llama/patches/0006-add-mllama-support.patch
View file @ 20c5fd39
...
@@ -5,7 +5,6 @@ Subject: [PATCH] add mllama support
...
@@ -5,7 +5,6 @@ Subject: [PATCH] add mllama support
adds support for the llama 3.2 vision architecture
adds support for the llama 3.2 vision architecture
---
---
examples/llava/gemma3-cli.cpp | 3 +-
examples/llava/llava.cpp | 5 +-
examples/llava/llava.cpp | 5 +-
examples/llava/mtmd.cpp | 6 +-
examples/llava/mtmd.cpp | 6 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
...
@@ -25,34 +24,13 @@ adds support for the llama 3.2 vision architecture
...
@@ -25,34 +24,13 @@ adds support for the llama 3.2 vision architecture
src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++-
src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++-
src/llama-model.h | 12 ++
src/llama-model.h | 12 ++
src/llama-quant.cpp | 4 +-
src/llama-quant.cpp | 4 +-
20 files changed, 475 insertions(+), 22 deletions(-)
19 files changed, 473 insertions(+), 21 deletions(-)
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 3d566475..654d1358 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -106,7 +106,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -118,6 +118,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 03a22cbb..5eb40bcd 100644
index c00d16ae..bab027b5 100644
--- a/examples/llava/llava.cpp
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -456,7 +456,7 @@ struct llava_embd_batch {
@@ -457,7 +457,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
std::vector<int8_t> logits;
llama_batch batch;
llama_batch batch;
...
@@ -61,7 +39,7 @@ index 03a22cbb..5eb40bcd 100644
...
@@ -61,7 +39,7 @@ index 03a22cbb..5eb40bcd 100644
pos .resize(n_tokens);
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
seq_ids .resize(n_tokens + 1);
@@ -468,6 +468,7 @@ struct llava_embd_batch {
@@ -469,6 +469,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*embd =*/ embd,
...
@@ -69,7 +47,7 @@ index 03a22cbb..5eb40bcd 100644
...
@@ -69,7 +47,7 @@ index 03a22cbb..5eb40bcd 100644
/*pos =*/ pos.data(),
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*seq_id =*/ seq_ids.data(),
@@ -491,7 +492,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
@@ -492,7 +493,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
n_eval = n_batch;
}
}
float * embd = image_embed->embed+i*n_embd;
float * embd = image_embed->embed+i*n_embd;
...
@@ -79,19 +57,19 @@ index 03a22cbb..5eb40bcd 100644
...
@@ -79,19 +57,19 @@ index 03a22cbb..5eb40bcd 100644
LOG_ERR("%s : failed to eval\n", __func__);
LOG_ERR("%s : failed to eval\n", __func__);
return false;
return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 3fd5bebc..f0cec596 100644
index 7081fd73..c14ac501 100644
--- a/examples/llava/mtmd.cpp
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -233,7 +233,7 @@ struct decode_embd_batch {
@@ -476,7 +476,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
std::vector<int8_t> logits;
llama_batch batch;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens);
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
seq_ids .resize(n_tokens + 1);
@@ -245,6 +245,7 @@ struct decode_embd_batch {
@@ -487,6 +487,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*embd =*/ embd,
...
@@ -99,16 +77,16 @@ index 3fd5bebc..f0cec596 100644
...
@@ -99,16 +77,16 @@ index 3fd5bebc..f0cec596 100644
/*pos =*/ pos.data(),
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*seq_id =*/ seq_ids.data(),
@@ -311,7 +312,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
@@ -610,7 +611,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t i_batch = 0;
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
float * embd = mtmd_get_output_embd(ctx);
float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
- decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_img(embd, n_embd, n_tokens, n_past, 0);
+ decode_embd_batch batch_embd(embd, n_embd, n_tokens, n_past, 0);
int64_t t1 = ggml_time_ms();
ret = llama_decode(lctx, batch_img.batch);
const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
if (ret != 0) {
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31..82ae1b5b 100644
index 405d8e31..82ae1b5b 100644
--- a/ggml/src/ggml-backend-reg.cpp
--- a/ggml/src/ggml-backend-reg.cpp
...
@@ -127,10 +105,10 @@ index 405d8e31..82ae1b5b 100644
...
@@ -127,10 +105,10 @@ index 405d8e31..82ae1b5b 100644
register_backend(ggml_backend_rpc_reg());
register_backend(ggml_backend_rpc_reg());
#endif
#endif
diff --git a/include/llama.h b/include/llama.h
diff --git a/include/llama.h b/include/llama.h
index 5657fbf0..f91896e4 100644
index 06c56395..f1628e88 100644
--- a/include/llama.h
--- a/include/llama.h
+++ b/include/llama.h
+++ b/include/llama.h
@@ -255,6 +255,7 @@ extern "C" {
@@ -256,6 +256,7 @@ extern "C" {
llama_token * token;
llama_token * token;
float * embd;
float * embd;
...
@@ -138,7 +116,7 @@ index 5657fbf0..f91896e4 100644
...
@@ -138,7 +116,7 @@ index 5657fbf0..f91896e4 100644
llama_pos * pos;
llama_pos * pos;
int32_t * n_seq_id;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
llama_seq_id ** seq_id;
@@ -357,6 +358,7 @@ extern "C" {
@@ -358,6 +359,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
bool no_perf; // whether to measure performance timings
...
@@ -146,7 +124,7 @@ index 5657fbf0..f91896e4 100644
...
@@ -146,7 +124,7 @@ index 5657fbf0..f91896e4 100644
// Abort callback
// Abort callback
// if it returns true, execution of llama_decode() will be aborted
// if it returns true, execution of llama_decode() will be aborted
@@ -458,6 +460,10 @@ extern "C" {
@@ -459,6 +461,10 @@ extern "C" {
struct llama_context_params params),
struct llama_context_params params),
"use llama_init_from_model instead");
"use llama_init_from_model instead");
...
@@ -158,7 +136,7 @@ index 5657fbf0..f91896e4 100644
...
@@ -158,7 +136,7 @@ index 5657fbf0..f91896e4 100644
LLAMA_API void llama_free(struct llama_context * ctx);
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index f754bc8f..0568565f 100644
index 5ab3f572..eb7b5325 100644
--- a/src/llama-arch.cpp
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@
@@ -6,6 +6,7 @@
...
@@ -169,7 +147,7 @@ index f754bc8f..0568565f 100644
...
@@ -169,7 +147,7 @@ index f754bc8f..0568565f 100644
{ LLM_ARCH_LLAMA4, "llama4" },
{ LLM_ARCH_LLAMA4, "llama4" },
{ LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_FALCON, "falcon" },
@@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -144,6 +145,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
...
@@ -177,7 +155,7 @@ index f754bc8f..0568565f 100644
...
@@ -177,7 +155,7 @@ index f754bc8f..0568565f 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -271,6 +273,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -273,6 +275,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
},
},
...
@@ -218,7 +196,7 @@ index f754bc8f..0568565f 100644
...
@@ -218,7 +196,7 @@ index f754bc8f..0568565f 100644
{
{
LLM_ARCH_DECI,
LLM_ARCH_DECI,
{
{
@@ -1681,6 +1717,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1701,6 +1737,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
...
@@ -234,7 +212,7 @@ index f754bc8f..0568565f 100644
...
@@ -234,7 +212,7 @@ index f754bc8f..0568565f 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 439aaeab..6a989034 100644
index 525c1b7d..bc8a4f0b 100644
--- a/src/llama-arch.h
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -11,6 +11,7 @@
@@ -11,6 +11,7 @@
...
@@ -245,7 +223,7 @@ index 439aaeab..6a989034 100644
...
@@ -245,7 +223,7 @@ index 439aaeab..6a989034 100644
LLM_ARCH_DECI,
LLM_ARCH_DECI,
LLM_ARCH_FALCON,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_BAICHUAN,
@@ -146,6 +147,7 @@ enum llm_kv {
@@ -148,6 +149,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
...
@@ -253,7 +231,7 @@ index 439aaeab..6a989034 100644
...
@@ -253,7 +231,7 @@ index 439aaeab..6a989034 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -347,6 +349,14 @@ enum llm_tensor {
@@ -349,6 +351,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
LLM_TENSOR_BSKCN_TV,
...
@@ -297,10 +275,10 @@ index 01d5ca57..8682b0e6 100644
...
@@ -297,10 +275,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
}
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 32f59819..0343ba8a 100644
index 9c1fe93f..cd06ad91 100644
--- a/src/llama-context.cpp
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -862,7 +862,7 @@ float * llama_context::get_logits_ith(int32_t i) {
@@ -851,7 +851,7 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
}
}
...
@@ -309,7 +287,7 @@ index 32f59819..0343ba8a 100644
...
@@ -309,7 +287,7 @@ index 32f59819..0343ba8a 100644
} catch (const std::exception & err) {
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
#ifndef NDEBUG
@@ -983,6 +983,10 @@ void llama_context::set_warmup(bool value) {
@@ -972,6 +972,10 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value;
cparams.warmup = value;
}
}
...
@@ -320,7 +298,7 @@ index 32f59819..0343ba8a 100644
...
@@ -320,7 +298,7 @@ index 32f59819..0343ba8a 100644
void llama_context::set_adapter_lora(
void llama_context::set_adapter_lora(
llama_adapter_lora * adapter,
llama_adapter_lora * adapter,
float scale) {
float scale) {
@@ -1058,7 +1062,7 @@ int llama_context::encode(llama_batch & inp_batch) {
@@ -1047,7 +1051,7 @@ int llama_context::encode(llama_batch & inp_batch) {
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd;
...
@@ -329,7 +307,7 @@ index 32f59819..0343ba8a 100644
...
@@ -329,7 +307,7 @@ index 32f59819..0343ba8a 100644
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@@ -1198,10 +1202,9 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1187,10 +1191,9 @@ int llama_context::decode(llama_batch & inp_batch) {
const llama_batch & batch = batch_allocr.batch;
const llama_batch & batch = batch_allocr.batch;
...
@@ -341,7 +319,7 @@ index 32f59819..0343ba8a 100644
...
@@ -341,7 +319,7 @@ index 32f59819..0343ba8a 100644
const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd;
@@ -1249,7 +1252,7 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1238,7 +1241,7 @@ int llama_context::decode(llama_batch & inp_batch) {
const bool logits_all = n_outputs_all == n_tokens_all;
const bool logits_all = n_outputs_all == n_tokens_all;
...
@@ -350,7 +328,7 @@ index 32f59819..0343ba8a 100644
...
@@ -350,7 +328,7 @@ index 32f59819..0343ba8a 100644
/* simple_split */ !kv_self->recurrent,
/* simple_split */ !kv_self->recurrent,
/* logits_all */ logits_all);
/* logits_all */ logits_all);
@@ -1483,12 +1486,11 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1472,12 +1475,11 @@ int llama_context::decode(llama_batch & inp_batch) {
int32_t llama_context::output_reserve(int32_t n_outputs) {
int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
const auto & hparams = model.hparams;
...
@@ -364,7 +342,7 @@ index 32f59819..0343ba8a 100644
...
@@ -364,7 +342,7 @@ index 32f59819..0343ba8a 100644
const auto n_embd = hparams.n_embd;
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
// TODO: use a per-batch flag for logits presence instead
@@ -1558,7 +1560,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
@@ -1545,7 +1547,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
void llama_context::output_reorder() {
void llama_context::output_reorder() {
auto & out_ids = sbatch.out_ids;
auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) {
if (!out_ids.empty()) {
...
@@ -373,7 +351,7 @@ index 32f59819..0343ba8a 100644
...
@@ -373,7 +351,7 @@ index 32f59819..0343ba8a 100644
const uint32_t n_embd = model.hparams.n_embd;
const uint32_t n_embd = model.hparams.n_embd;
GGML_ASSERT((size_t) n_outputs == out_ids.size());
GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -2065,7 +2067,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
@@ -2052,7 +2054,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{
{
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
...
@@ -382,7 +360,7 @@ index 32f59819..0343ba8a 100644
...
@@ -382,7 +360,7 @@ index 32f59819..0343ba8a 100644
io.write(&logits_size, sizeof(logits_size));
io.write(&logits_size, sizeof(logits_size));
@@ -2248,6 +2250,7 @@ llama_context_params llama_context_default_params() {
@@ -2235,6 +2237,7 @@ llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
/*.no_perf =*/ true,
...
@@ -390,7 +368,7 @@ index 32f59819..0343ba8a 100644
...
@@ -390,7 +368,7 @@ index 32f59819..0343ba8a 100644
/*.abort_callback =*/ nullptr,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
};
@@ -2375,6 +2378,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
@@ -2362,6 +2365,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup);
ctx->set_warmup(warmup);
}
}
...
@@ -402,7 +380,7 @@ index 32f59819..0343ba8a 100644
...
@@ -402,7 +380,7 @@ index 32f59819..0343ba8a 100644
ctx->synchronize();
ctx->synchronize();
}
}
diff --git a/src/llama-context.h b/src/llama-context.h
diff --git a/src/llama-context.h b/src/llama-context.h
index 04facb54..baa03276 100644
index 5457f077..a50c4afa 100644
--- a/src/llama-context.h
--- a/src/llama-context.h
+++ b/src/llama-context.h
+++ b/src/llama-context.h
@@ -65,6 +65,7 @@ struct llama_context {
@@ -65,6 +65,7 @@ struct llama_context {
...
@@ -426,10 +404,10 @@ index 30e550f0..85ad91b9 100644
...
@@ -426,10 +404,10 @@ index 30e550f0..85ad91b9 100644
enum llama_pooling_type pooling_type;
enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index a85e9728..d740c120 100644
index fabb9ca2..b67216a4 100644
--- a/src/llama-graph.cpp
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
@@ -560,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
}
}
}
}
...
@@ -442,7 +420,7 @@ index a85e9728..d740c120 100644
...
@@ -442,7 +420,7 @@ index a85e9728..d740c120 100644
//
//
// llm_graph_context
// llm_graph_context
//
//
@@ -1506,6 +1512,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
@@ -1532,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
}
}
...
@@ -469,7 +447,7 @@ index a85e9728..d740c120 100644
...
@@ -469,7 +447,7 @@ index a85e9728..d740c120 100644
llm_graph_input_attn_cross * inp,
llm_graph_input_attn_cross * inp,
ggml_cgraph * gf,
ggml_cgraph * gf,
diff --git a/src/llama-graph.h b/src/llama-graph.h
diff --git a/src/llama-graph.h b/src/llama-graph.h
index d192dc14..260a2af2 100644
index d0c8d321..0fe18150 100644
--- a/src/llama-graph.h
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -86,6 +86,7 @@ public:
@@ -86,6 +86,7 @@ public:
...
@@ -480,7 +458,7 @@ index d192dc14..260a2af2 100644
...
@@ -480,7 +458,7 @@ index d192dc14..260a2af2 100644
};
};
class llm_graph_input_pos : public llm_graph_input_i {
class llm_graph_input_pos : public llm_graph_input_i {
@@ -285,6 +286,16 @@ public:
@@ -283,6 +284,16 @@ public:
const llama_cross * cross = nullptr;
const llama_cross * cross = nullptr;
};
};
...
@@ -497,7 +475,7 @@ index d192dc14..260a2af2 100644
...
@@ -497,7 +475,7 @@ index d192dc14..260a2af2 100644
//
//
// llm_graph_result
// llm_graph_result
//
//
@@ -493,6 +504,7 @@ struct llm_graph_context {
@@ -491,6 +502,7 @@ struct llm_graph_context {
ggml_tensor * build_inp_cls() const;
ggml_tensor * build_inp_cls() const;
ggml_tensor * build_inp_s_copy() const;
ggml_tensor * build_inp_s_copy() const;
ggml_tensor * build_inp_s_mask() const;
ggml_tensor * build_inp_s_mask() const;
...
@@ -518,7 +496,7 @@ index 8a667960..6a02de03 100644
...
@@ -518,7 +496,7 @@ index 8a667960..6a02de03 100644
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
+}
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 6e278945..c8a34d52 100644
index 48dce407..b6fc7e6d 100644
--- a/src/llama-hparams.h
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -2,6 +2,8 @@
@@ -2,6 +2,8 @@
...
@@ -546,7 +524,7 @@ index 6e278945..c8a34d52 100644
...
@@ -546,7 +524,7 @@ index 6e278945..c8a34d52 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_q = 0;
@@ -158,6 +162,9 @@ struct llama_hparams {
@@ -159,6 +163,9 @@ struct llama_hparams {
// Block skip connection
// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;
bool n_bskcn(uint32_t n, uint32_t il) const;
...
@@ -593,10 +571,10 @@ index a012aeae..2e11507d 100644
...
@@ -593,10 +571,10 @@ index a012aeae..2e11507d 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index aba42819..d051696c 100644
index 572378c9..9d099f11 100644
--- a/src/llama-model.cpp
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -423,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// get general kv
// get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
...
@@ -604,7 +582,7 @@ index aba42819..d051696c 100644
...
@@ -604,7 +582,7 @@ index aba42819..d051696c 100644
// everything past this point is not vocab-related
// everything past this point is not vocab-related
if (hparams.vocab_only) {
if (hparams.vocab_only) {
@@ -430,6 +431,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -434,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
...
@@ -612,7 +590,7 @@ index aba42819..d051696c 100644
...
@@ -612,7 +590,7 @@ index aba42819..d051696c 100644
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -453,9 +455,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
...
@@ -624,7 +602,7 @@ index aba42819..d051696c 100644
...
@@ -624,7 +602,7 @@ index aba42819..d051696c 100644
// n_head_kv is optional, default to n_head
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -508,7 +512,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
...
@@ -633,7 +611,7 @@ index aba42819..d051696c 100644
...
@@ -633,7 +611,7 @@ index aba42819..d051696c 100644
if (hparams.n_rot != hparams.n_embd_head_k) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
}
@@ -571,6 +575,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.use_kq_norm = false;
hparams.use_kq_norm = false;
}
}
} break;
} break;
...
@@ -650,7 +628,7 @@ index aba42819..d051696c 100644
...
@@ -650,7 +628,7 @@ index aba42819..d051696c 100644
case LLM_ARCH_DECI:
case LLM_ARCH_DECI:
{
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1550,7 +1564,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff();
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
const int64_t n_embd_gqa = n_embd_v_gqa;
...
@@ -659,7 +637,7 @@ index aba42819..d051696c 100644
...
@@ -659,7 +637,7 @@ index aba42819..d051696c 100644
const int64_t n_token_types = vocab.n_token_types();
const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot;
const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert;
const int64_t n_expert = hparams.n_expert;
@@ -1803,6 +1817,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1815,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
}
}
} break;
} break;
...
@@ -712,7 +690,7 @@ index aba42819..d051696c 100644
...
@@ -712,7 +690,7 @@ index aba42819..d051696c 100644
case LLM_ARCH_DECI:
case LLM_ARCH_DECI:
{
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4683,6 +4743,246 @@ struct llm_build_llama : public llm_graph_context {
@@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context {
}
}
};
};
...
@@ -959,7 +937,7 @@ index aba42819..d051696c 100644
...
@@ -959,7 +937,7 @@ index aba42819..d051696c 100644
struct llm_build_deci : public llm_graph_context {
struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13017,6 +13317,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -13063,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
{
llm = std::make_unique<llm_build_llama>(*this, params, gf);
llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break;
} break;
...
@@ -970,7 +948,7 @@ index aba42819..d051696c 100644
...
@@ -970,7 +948,7 @@ index aba42819..d051696c 100644
case LLM_ARCH_DECI:
case LLM_ARCH_DECI:
{
{
llm = std::make_unique<llm_build_deci>(*this, params, gf);
llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13377,6 +13681,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4:
case LLM_ARCH_LLAMA4:
...
@@ -979,7 +957,7 @@ index aba42819..d051696c 100644
...
@@ -979,7 +957,7 @@ index aba42819..d051696c 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
diff --git a/src/llama-model.h b/src/llama-model.h
index 5865d5e9..72bab5be 100644
index 856e6042..6be91282 100644
--- a/src/llama-model.h
--- a/src/llama-model.h
+++ b/src/llama-model.h
+++ b/src/llama-model.h
@@ -11,6 +11,7 @@
@@ -11,6 +11,7 @@
...
@@ -990,15 +968,15 @@ index 5865d5e9..72bab5be 100644
...
@@ -990,15 +968,15 @@ index 5865d5e9..72bab5be 100644
struct llama_cparams;
struct llama_cparams;
struct llama_ubatch;
struct llama_ubatch;
@@ -70,6 +71,7 @@ enum llm_type {
@@ -73,6 +74,7 @@ enum llm_type {
LLM_TYPE_40B,
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_65B,
LLM_TYPE_70B,
LLM_TYPE_70B,
+ LLM_TYPE_90B,
+ LLM_TYPE_90B,
LLM_TYPE_236B,
LLM_TYPE_236B,
LLM_TYPE_290B,
LLM_TYPE_314B,
LLM_TYPE_314B,
LLM_TYPE_671B,
@@ -314,6 +316,16 @@ struct llama_layer {
@@ -310,6 +312,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;
struct ggml_tensor * bskcn_tv = nullptr;
...
...
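Taken together, the llava.cpp and mtmd.cpp hunks above show the calling convention this patch expects: an embedding-only batch now records the model's embedding width (n_embd) next to the embd pointer, and callers obtain that width from the model before decoding. A minimal sketch of that pattern, assuming the patched llama_batch layout shown above (the helper name eval_image_embd and the fixed sequence id 0 are illustrative, not part of the patch):

#include "llama.h"
#include <vector>

// Decode n_tokens image embeddings starting at position n_past on sequence 0.
// The /*n_embd =*/ field is the one this patch adds to llama_batch.
static int eval_image_embd(llama_context * lctx, float * embd, int32_t n_tokens, llama_pos n_past) {
    const int32_t n_embd = llama_model_n_embd(llama_get_model(lctx));

    std::vector<llama_pos>      pos(n_tokens);
    std::vector<int32_t>        n_seq_id(n_tokens, 1);
    std::vector<llama_seq_id>   seq_id_0(1, 0);
    std::vector<llama_seq_id *> seq_ids(n_tokens + 1, nullptr);
    std::vector<int8_t>         logits(n_tokens, 0);

    for (int32_t i = 0; i < n_tokens; i++) {
        pos[i]     = n_past + i;
        seq_ids[i] = seq_id_0.data();
    }

    llama_batch batch = {
        /*n_tokens =*/ n_tokens,
        /*tokens   =*/ nullptr,
        /*embd     =*/ embd,
        /*n_embd   =*/ n_embd,   // new field introduced by this patch
        /*pos      =*/ pos.data(),
        /*n_seq_id =*/ n_seq_id.data(),
        /*seq_id   =*/ seq_ids.data(),
        /*logits   =*/ logits.data(),
    };
    return llama_decode(lctx, batch);
}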
llama/patches/0007-add-unpad-operator.patch
View file @ 20c5fd39
...
@@ -18,10 +18,10 @@ adds the unpad operator to GGML
...
@@ -18,10 +18,10 @@ adds the unpad operator to GGML
10 files changed, 223 insertions(+), 2 deletions(-)
10 files changed, 223 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 8fcc16df..d19fc167 100644
index 1b8603e7..53ef31b2 100644
--- a/ggml/include/ggml.h
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -488,6 +488,7 @@ extern "C" {
@@ -489,6 +489,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
GGML_OP_PAD_REFLECT_1D,
...
@@ -29,7 +29,7 @@ index 8fcc16df..d19fc167 100644
...
@@ -29,7 +29,7 @@ index 8fcc16df..d19fc167 100644
GGML_OP_ARANGE,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
GGML_OP_ARGSORT,
@@ -1757,6 +1758,15 @@ extern "C" {
@@ -1777,6 +1778,15 @@ extern "C" {
int p0,
int p0,
int p1);
int p1);
...
@@ -46,10 +46,10 @@ index 8fcc16df..d19fc167 100644
...
@@ -46,10 +46,10 @@ index 8fcc16df..d19fc167 100644
// timesteps: [N,]
// timesteps: [N,]
// return: [N, dim]
// return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 50400328..432942bf 100644
index 64405449..34624cca 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1960,6 +1960,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
} break;
...
@@ -60,7 +60,7 @@ index 50400328..432942bf 100644
...
@@ -60,7 +60,7 @@ index 50400328..432942bf 100644
case GGML_OP_ARANGE:
case GGML_OP_ARANGE:
{
{
ggml_compute_forward_arange(params, tensor);
ggml_compute_forward_arange(params, tensor);
@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_PAD_REFLECT_1D:
...
@@ -69,10 +69,10 @@ index 50400328..432942bf 100644
...
@@ -69,10 +69,10 @@ index 50400328..432942bf 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 6050147b..66b8da68 100644
index 7413192b..becdae07 100644
--- a/ggml/src/ggml-cpu/ops.cpp
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6531,6 +6531,61 @@ void ggml_compute_forward_pad_reflect_1d(
@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d(
}
}
}
}
...
@@ -135,10 +135,10 @@ index 6050147b..66b8da68 100644
...
@@ -135,10 +135,10 @@ index 6050147b..66b8da68 100644
static void ggml_compute_forward_arange_f32(
static void ggml_compute_forward_arange_f32(
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 410a3720..3eca1cf8 100644
index dc081b9e..a7125555 100644
--- a/ggml/src/ggml-cpu/ops.h
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -71,6 +71,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
...
@@ -147,10 +147,10 @@ index 410a3720..3eca1cf8 100644
...
@@ -147,10 +147,10 @@ index 410a3720..3eca1cf8 100644
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 31750b6f..0fef9522 100644
index 04ce764e..491acccb 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2246,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
ggml_cuda_op_pad(ctx, dst);
break;
break;
...
@@ -160,7 +160,7 @@ index 31750b6f..0fef9522 100644
...
@@ -160,7 +160,7 @@ index 31750b6f..0fef9522 100644
case GGML_OP_ARANGE:
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
ggml_cuda_op_arange(ctx, dst);
break;
break;
@@ -3222,6 +3225,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE:
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD:
case GGML_OP_PAD:
...
@@ -233,7 +233,7 @@ index 8fd386b0..e2ededc3 100644
...
@@ -233,7 +233,7 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 12886cd3..b2e95a66 100644
index 425524d0..112abef6 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
...
@@ -244,7 +244,7 @@ index 12886cd3..b2e95a66 100644
...
@@ -244,7 +244,7 @@ index 12886cd3..b2e95a66 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -1020,6 +1021,7 @@ @implementation GGMLMetalClass
@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
...
@@ -252,7 +252,7 @@ index 12886cd3..b2e95a66 100644
...
@@ -252,7 +252,7 @@ index 12886cd3..b2e95a66 100644
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1384,6 +1386,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
case GGML_OP_POOL_2D:
case GGML_OP_PAD:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_PAD_REFLECT_1D:
...
@@ -260,7 +260,7 @@ index 12886cd3..b2e95a66 100644
...
@@ -260,7 +260,7 @@ index 12886cd3..b2e95a66 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU:
case GGML_OP_LEAKY_RELU:
@@ -3731,6 +3734,36 @@ static void ggml_metal_encode_node(
@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
const int nth = MIN(1024, ne0);
...
@@ -298,7 +298,7 @@ index 12886cd3..b2e95a66 100644
...
@@ -298,7 +298,7 @@ index 12886cd3..b2e95a66 100644
} break;
} break;
case GGML_OP_ARANGE:
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 8d6e99e6..71f0f97f 100644
index 9f4147e9..6ceb3cef 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
...
@@ -354,10 +354,10 @@ index 8d6e99e6..71f0f97f 100644
...
@@ -354,10 +354,10 @@ index 8d6e99e6..71f0f97f 100644
device char * dst,
device char * dst,
constant ggml_metal_kargs_arange & args,
constant ggml_metal_kargs_arange & args,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 950772c7..2276b631 100644
index 7654ae17..3c57aff8 100644
--- a/ggml/src/ggml.c
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -963,6 +963,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"UPSCALE",
"PAD",
"PAD",
"PAD_REFLECT_1D",
"PAD_REFLECT_1D",
...
@@ -365,16 +365,16 @@ index 950772c7..2276b631 100644
...
@@ -365,16 +365,16 @@ index 950772c7..2276b631 100644
"ARANGE",
"ARANGE",
"TIMESTEP_EMBEDDING",
"TIMESTEP_EMBEDDING",
"ARGSORT",
"ARGSORT",
@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
"OPT_STEP_ADAMW",
};
};
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
"none",
@@ -1057,6 +1058,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"upscale(x)",
"pad(x)",
"pad(x)",
"pad_reflect_1d(x)",
"pad_reflect_1d(x)",
...
@@ -382,16 +382,16 @@ index 950772c7..2276b631 100644
...
@@ -382,16 +382,16 @@ index 950772c7..2276b631 100644
"arange(start, stop, step)",
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
"argsort(x)",
@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
"adamw(x)",
};
};
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4262,6 +4264,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
return result;
}
}
...
...
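The unpad patch above registers GGML_OP_UNPAD across the CPU, CUDA, and Metal backends; conceptually it is the inverse of GGML_OP_PAD, trimming trailing elements from each dimension instead of appending them. A rough usage sketch, assuming the ggml_unpad() builder takes per-dimension trim counts the same way ggml_pad() takes per-dimension pad counts (the exact declaration is not shown in this excerpt):

#include "ggml.h"

// Hypothetical helper: undo an earlier ggml_pad() so the tensor returns to
// its original ne0 x ne1 extent. The ggml_unpad(ctx, a, p0, p1, p2, p3)
// signature is an assumption here, mirroring ggml_pad.
static struct ggml_tensor * crop_back(struct ggml_context * ctx,
                                      struct ggml_tensor  * padded,
                                      int64_t ne0, int64_t ne1) {
    return ggml_unpad(ctx, padded,
                      (int)(padded->ne[0] - ne0),
                      (int)(padded->ne[1] - ne1),
                      0, 0);
}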
llama/patches/0008-fix-deepseek-deseret-regex.patch
View file @ 20c5fd39
...
@@ -12,7 +12,7 @@ regex
...
@@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-)
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a35b498c..032019c9 100644
index a9ee9f03..1306864e 100644
--- a/src/llama-vocab.cpp
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
...
...