OpenDAS / ollama · Commits · 544b6739

ggml update to b6840 (#12791)

Unverified commit 544b6739, authored Nov 06, 2025 by Daniel Hiltgen, committed by GitHub on Nov 06, 2025.
Parent: c4ba257c

Changes: 103 · Showing 20 changed files on this page, with 585 additions and 135 deletions (+585 -135)
+1   -1    Makefile.sync
+1   -1    llama/build-info.cpp
+12  -12   llama/llama.cpp/common/json-schema-to-grammar.cpp
+40  -0    llama/llama.cpp/src/llama-arch.cpp
+4   -0    llama/llama.cpp/src/llama-arch.h
+1   -1    llama/llama.cpp/src/llama-batch.h
+35  -2    llama/llama.cpp/src/llama-chat.cpp
+2   -0    llama/llama.cpp/src/llama-chat.h
+2   -1    llama/llama.cpp/src/llama-context.cpp
+104 -43   llama/llama.cpp/src/llama-graph.cpp
+7   -3    llama/llama.cpp/src/llama-graph.h
+2   -0    llama/llama.cpp/src/llama-hparams.h
+321 -46   llama/llama.cpp/src/llama-model.cpp
+3   -0    llama/llama.cpp/src/llama-model.h
+7   -1    llama/llama.cpp/src/llama-quant.cpp
+1   -0    llama/llama.cpp/src/llama-vocab.cpp
+4   -0    llama/llama.cpp/src/llama.cpp
+2   -0    llama/llama.cpp/tools/mtmd/clip-impl.h
+15  -3    llama/llama.cpp/tools/mtmd/clip.cpp
+21  -21   llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
Makefile.sync  (view file @ 544b6739)

 UPSTREAM=https://github.com/ggml-org/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=7049736b2dd9011bf819e298b844ebbc4b5afdc9
+FETCH_HEAD=3cfa9c3f125763305b4226bc032f1954f08990dc

 .PHONY: help
 help:
...
llama/build-info.cpp  (view file @ 544b6739)

 int LLAMA_BUILD_NUMBER = 0;
-char const * LLAMA_COMMIT = "7049736b2dd9011bf819e298b844ebbc4b5afdc9";
+char const * LLAMA_COMMIT = "3cfa9c3f125763305b4226bc032f1954f08990dc";
 char const * LLAMA_COMPILER = "";
 char const * LLAMA_BUILD_TARGET = "";
llama/llama.cpp/common/json-schema-to-grammar.cpp  (view file @ 544b6739)
...
@@ -41,9 +41,9 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }
 
-static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
-    auto has_min = min_value != std::numeric_limits<int>::min();
-    auto has_max = max_value != std::numeric_limits<int>::max();
+static void _build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+    auto has_min = min_value != std::numeric_limits<int64_t>::min();
+    auto has_max = max_value != std::numeric_limits<int64_t>::max();
 
     auto digit_range = [&](char from, char to) {
         out << "[";
...
@@ -159,7 +159,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
     if (has_min) {
         if (min_value < 0) {
             out << "\"-\" (";
-            _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
             out << ") | [0] | [1-9] ";
             more_digits(0, decimals_left - 1);
         } else if (min_value == 0) {
...
@@ -194,7 +194,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
             }
             digit_range(c, c);
             out << " (";
-            _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
+            _build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
             out << ")";
             if (c < '9') {
                 out << " | ";
...
@@ -216,7 +216,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
             _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
         } else {
             out << "\"-\" (";
-            _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
             out << ")";
         }
         return;
...
@@ -925,17 +925,17 @@ public:
             int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
             return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
         } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
-            int min_value = std::numeric_limits<int>::min();
-            int max_value = std::numeric_limits<int>::max();
+            int64_t min_value = std::numeric_limits<int64_t>::min();
+            int64_t max_value = std::numeric_limits<int64_t>::max();
             if (schema.contains("minimum")) {
-                min_value = schema["minimum"].get<int>();
+                min_value = schema["minimum"].get<int64_t>();
             } else if (schema.contains("exclusiveMinimum")) {
-                min_value = schema["exclusiveMinimum"].get<int>() + 1;
+                min_value = schema["exclusiveMinimum"].get<int64_t>() + 1;
             }
             if (schema.contains("maximum")) {
-                max_value = schema["maximum"].get<int>();
+                max_value = schema["maximum"].get<int64_t>();
             } else if (schema.contains("exclusiveMaximum")) {
-                max_value = schema["exclusiveMaximum"].get<int>() - 1;
+                max_value = schema["exclusiveMaximum"].get<int64_t>() - 1;
             }
             std::stringstream out;
             out << "(";
...
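The change above widens the grammar builder's integer-bound handling from int to int64_t, so JSON Schema minimum/maximum values outside the 32-bit range no longer overflow. A minimal standalone illustration (the bound value below is made up, not taken from the patch):

#include <cstdint>
#include <iostream>
#include <limits>

int main() {
    // A schema bound such as {"minimum": 3000000000} does not fit in a 32-bit
    // int; reading it with get<int>() would overflow, while int64_t holds it.
    const int64_t minimum = 3000000000LL;

    std::cout << "fits in int?     "
              << (minimum <= (int64_t) std::numeric_limits<int>::max()) << '\n';   // prints 0
    std::cout << "fits in int64_t? "
              << (minimum <= std::numeric_limits<int64_t>::max()) << '\n';         // prints 1
}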
llama/llama.cpp/src/llama-arch.cpp  (view file @ 544b6739)
...
@@ -5,6 +5,7 @@
 #include <map>
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+    { LLM_ARCH_CLIP,             "clip"   }, // dummy, only used by llama-quantize
     { LLM_ARCH_LLAMA,            "llama"  },
     { LLM_ARCH_LLAMA4,           "llama4" },
     { LLM_ARCH_DECI,             "deci"   },
...
@@ -85,6 +86,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
     { LLM_ARCH_PLM,              "plm"              },
     { LLM_ARCH_BAILINGMOE,       "bailingmoe"       },
+    { LLM_ARCH_BAILINGMOE2,      "bailingmoe2"      },
     { LLM_ARCH_DOTS1,            "dots1"            },
     { LLM_ARCH_ARCEE,            "arcee"            },
     { LLM_ARCH_ERNIE4_5,         "ernie4_5"         },
...
@@ -135,6 +137,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_COUNT,              "%s.expert_count"              },
     { LLM_KV_EXPERT_USED_COUNT,         "%s.expert_used_count"         },
     { LLM_KV_EXPERT_SHARED_COUNT,       "%s.expert_shared_count"       },
+    { LLM_KV_EXPERT_GROUP_COUNT,        "%s.expert_group_count"        },
+    { LLM_KV_EXPERT_GROUP_USED_COUNT,   "%s.expert_group_used_count"   },
     { LLM_KV_EXPERT_WEIGHTS_SCALE,      "%s.expert_weights_scale"      },
     { LLM_KV_EXPERT_WEIGHTS_NORM,       "%s.expert_weights_norm"       },
     { LLM_KV_EXPERT_GATING_FUNC,        "%s.expert_gating_func"        },
...
@@ -277,6 +281,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+    {
+        LLM_ARCH_CLIP,
+        {},
+    },
     {
         LLM_ARCH_LLAMA,
         {
...
@@ -1961,6 +1969,38 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_SHEXP,   "blk.%d.ffn_up_shexp" },
         },
     },
+    {
+        LLM_ARCH_BAILINGMOE2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,             "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,            "output_norm" },
+            { LLM_TENSOR_OUTPUT,                 "output" },
+            { LLM_TENSOR_ATTN_NORM,              "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q_NORM,            "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,            "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_QKV,               "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,               "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP,           "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B,        "blk.%d.exp_probs_b" },
+            { LLM_TENSOR_FFN_NORM,               "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,               "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,               "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,                 "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXPS,          "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,          "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,            "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,         "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,         "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,           "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_NEXTN_EH_PROJ,          "blk.%d.nextn.eh_proj" },
+            { LLM_TENSOR_NEXTN_EMBED_TOKENS,     "blk.%d.nextn.embed_tokens" },
+            { LLM_TENSOR_NEXTN_ENORM,            "blk.%d.nextn.enorm" },
+            { LLM_TENSOR_NEXTN_HNORM,            "blk.%d.nextn.hnorm" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+            { LLM_TENSOR_LAYER_OUT_NORM,         "blk.%d.layer_output_norm" },
+        },
+    },
     {
         LLM_ARCH_DOTS1,
         {
...
llama/llama.cpp/src/llama-arch.h  (view file @ 544b6739)
...
@@ -9,6 +9,7 @@
 //
 
 enum llm_arch {
+    LLM_ARCH_CLIP,
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
     LLM_ARCH_DECI,
...
@@ -89,6 +90,7 @@ enum llm_arch {
     LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_PLM,
     LLM_ARCH_BAILINGMOE,
+    LLM_ARCH_BAILINGMOE2,
     LLM_ARCH_DOTS1,
     LLM_ARCH_ARCEE,
     LLM_ARCH_ERNIE4_5,
...
@@ -139,6 +141,8 @@ enum llm_kv {
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_EXPERT_SHARED_COUNT,
+    LLM_KV_EXPERT_GROUP_COUNT,
+    LLM_KV_EXPERT_GROUP_USED_COUNT,
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
...
llama/llama.cpp/src/llama-batch.h  (view file @ 544b6739)
...
@@ -123,7 +123,7 @@ private:
     uint32_t n_seq_max;
     uint32_t n_outputs;
 
-    std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
+    std::array<llama_seq_id, 1> seq_id_0 = {{ 0 }}; // default sequence id
 
     std::vector<llama_pos>      pos;
     std::vector<int32_t>        n_seq_id;
...
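The double braces above are a small portability touch: std::array is an aggregate wrapping a raw array, and single-brace initialization relies on brace elision, which some compilers flag under -Wmissing-braces. A standalone illustration (the likely motivation is an assumption; the commit itself does not state it):

#include <array>

// Both forms are valid C++; the inner braces initialize the wrapped raw array
// explicitly, which keeps -Wmissing-braces quiet on compilers that emit it.
std::array<int, 1> a = { 0 };   // relies on brace elision
std::array<int, 1> b = {{ 0 }}; // explicit nested braces

int main() { return a[0] + b[0]; }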
llama/llama.cpp/src/llama-chat.cpp  (view file @ 544b6739)
...
@@ -63,6 +63,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "megrez",            LLM_CHAT_TEMPLATE_MEGREZ            },
     { "yandex",            LLM_CHAT_TEMPLATE_YANDEX            },
     { "bailing",           LLM_CHAT_TEMPLATE_BAILING           },
+    { "bailing-think",     LLM_CHAT_TEMPLATE_BAILING_THINK     },
+    { "bailing2",          LLM_CHAT_TEMPLATE_BAILING2          },
     { "llama4",            LLM_CHAT_TEMPLATE_LLAMA4            },
     { "smolvlm",           LLM_CHAT_TEMPLATE_SMOLVLM           },
     { "hunyuan-moe",       LLM_CHAT_TEMPLATE_HUNYUAN_MOE       },
...
@@ -191,6 +193,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_YANDEX;
     } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("'HUMAN'")) {
         return LLM_CHAT_TEMPLATE_BAILING;
+    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("\"HUMAN\"") && tmpl_contains("<think>")) {
+        return LLM_CHAT_TEMPLATE_BAILING_THINK;
+    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("<role>HUMAN</role>") && tmpl_contains("<|role_end|>")) {
+        return LLM_CHAT_TEMPLATE_BAILING2;
     } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
         return LLM_CHAT_TEMPLATE_LLAMA4;
     } else if (tmpl_contains("<|endofuserprompt|>")) {
...
@@ -644,8 +650,8 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << " Ассистент:[SEP]";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING) {
-        // Bailing (Ling) template
+    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING || tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
+        // Bailing (Ling/Ring) template
         for (auto message : chat) {
             std::string role(message->role);
...
@@ -658,6 +664,33 @@ int32_t llm_chat_apply_template(
             ss << "<role>" << role << "</role>" << message->content;
         }
 
         if (add_ass) {
             ss << "<role>ASSISTANT</role>";
+
+            if (tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
+                ss << "<think>";
+            }
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING2) {
+        // Bailing2 (Ling 2.0) template
+        bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+
+        if (!has_system) {
+            ss << "<role>SYSTEM</role>detailed thinking off<|role_end|>";
+        }
+
+        for (auto message : chat) {
+            std::string role(message->role);
+
+            if (role == "user") {
+                role = "HUMAN";
+            } else {
+                std::transform(role.begin(), role.end(), role.begin(), ::toupper);
+            }
+
+            ss << "<role>" << role << "</role>" << message->content << "<|role_end|>";
+        }
+
+        if (add_ass) {
+            ss << "<role>ASSISTANT</role>";
         }
...
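For the new Bailing2 path, the code above renders a chat with no system message by injecting a default SYSTEM turn and closing every turn with <|role_end|>. A self-contained sketch of the same string formatting (the tiny msg struct is invented for illustration; llama.cpp itself formats llama_chat_message entries inside llm_chat_apply_template):

#include <algorithm>
#include <cctype>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical message type, for illustration only.
struct msg { std::string role, content; };

int main() {
    std::vector<msg> chat = { { "user", "Hello" } };

    std::ostringstream ss;
    const bool has_system = !chat.empty() && chat[0].role == "system";
    if (!has_system) {
        ss << "<role>SYSTEM</role>detailed thinking off<|role_end|>";
    }
    for (const auto & m : chat) {
        std::string role = m.role;
        if (role == "user") {
            role = "HUMAN";
        } else {
            std::transform(role.begin(), role.end(), role.begin(), ::toupper);
        }
        ss << "<role>" << role << "</role>" << m.content << "<|role_end|>";
    }
    ss << "<role>ASSISTANT</role>"; // add_ass == true

    std::cout << ss.str() << "\n";
    // <role>SYSTEM</role>detailed thinking off<|role_end|><role>HUMAN</role>Hello<|role_end|><role>ASSISTANT</role>
}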
llama/llama.cpp/src/llama-chat.h  (view file @ 544b6739)
...
@@ -42,6 +42,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MEGREZ,
     LLM_CHAT_TEMPLATE_YANDEX,
     LLM_CHAT_TEMPLATE_BAILING,
+    LLM_CHAT_TEMPLATE_BAILING_THINK,
+    LLM_CHAT_TEMPLATE_BAILING2,
     LLM_CHAT_TEMPLATE_LLAMA4,
     LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_DOTS1,
...
llama/llama.cpp/src/llama-context.cpp  (view file @ 544b6739)
...
@@ -2345,7 +2345,8 @@ llama_context * llama_init_from_model(
         return nullptr;
     }
 
-    if (params.pooling_type != model->hparams.pooling_type) {
+    if (params.pooling_type != LLAMA_POOLING_TYPE_UNSPECIFIED &&
+        params.pooling_type != model->hparams.pooling_type) {
         //user-specified pooling-type is different from the model default
         LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n",
             __func__, model->hparams.pooling_type, params.pooling_type);
...
llama/llama.cpp/src/llama-graph.cpp  (view file @ 544b6739)
...
@@ -261,12 +261,17 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     }
 }
 
-static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
     LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
-    const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE) ? "LLAMA_SWA_TYPE_NONE" :
-                                (swa_type == LLAMA_SWA_TYPE_STANDARD) ? "LLAMA_SWA_TYPE_STANDARD" :
-                                (swa_type == LLAMA_SWA_TYPE_CHUNKED) ? "LLAMA_SWA_TYPE_CHUNKED" :
-                                (swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
+    const char * swa_type_str = "unknown";
+    switch (swa_type) {
+        case LLAMA_SWA_TYPE_NONE:      swa_type_str = "LLAMA_SWA_TYPE_NONE";      break;
+        case LLAMA_SWA_TYPE_STANDARD:  swa_type_str = "LLAMA_SWA_TYPE_STANDARD";  break;
+        case LLAMA_SWA_TYPE_CHUNKED:   swa_type_str = "LLAMA_SWA_TYPE_CHUNKED";   break;
+        case LLAMA_SWA_TYPE_SYMMETRIC: swa_type_str = "LLAMA_SWA_TYPE_SYMMETRIC"; break;
+    };
     LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
     LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
     LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
...
@@ -295,50 +300,67 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
     const int64_t n_kv     = ubatch->n_tokens;
     const int64_t n_tokens = ubatch->n_tokens;
 
-    GGML_ASSERT(kq_mask);
-    GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer));
-
-    float * data = (float *) kq_mask->data;
-
-    // [TAG_NO_CACHE_ISWA]
-    GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
-
-    for (int h = 0; h < 1; ++h) {
-        for (int i1 = 0; i1 < n_tokens; ++i1) {
-            const llama_seq_id s1 = ubatch->seq_id[i1][0];
-
-            for (int i0 = 0; i0 < n_tokens; ++i0) {
-                float f = -INFINITY;
-
-                for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
-                    const llama_seq_id s0 = ubatch->seq_id[i0][0];
-
-                    if (s0 != s1) {
-                        continue; // skip different sequences
-                    }
-
-                    if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
-                        continue; // skip future tokens for causal attention
-                    }
-
-                    // TODO: this does not take into account that some layers are SWA and others are note (i.e. iSWA) [TAG_NO_CACHE_ISWA]
-                    //if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
-                    //    continue; // skip masked tokens for SWA
-                    //}
-
-                    // TODO: reimplement this like in llama_kv_cache_unified
-
-                    if (hparams.use_alibi) {
-                        f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
-                    } else {
-                        f = 0.0f;
-                    }
-                }
-
-                data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
-            }
-        }
-    }
-
-    if (debug) {
-        print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
-    }
+    const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
+        for (int h = 0; h < 1; ++h) {
+            for (int i1 = 0; i1 < n_tokens; ++i1) {
+                const llama_seq_id s1 = ubatch->seq_id[i1][0];
+                const llama_pos    p1 = ubatch->pos[i1];
+
+                const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;
+
+                for (int i0 = 0; i0 < n_tokens; ++i0) {
+                    const llama_seq_id s0 = ubatch->seq_id[i0][0];
+                    const llama_pos    p0 = ubatch->pos[i0];
+
+                    // mask different sequences
+                    if (s0 != s1) {
+                        continue;
+                    }
+
+                    // mask future tokens
+                    if (cparams.causal_attn && p0 > p1) {
+                        continue;
+                    }
+
+                    // apply SWA if any
+                    if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
+                        continue;
+                    }
+
+                    data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
+                }
+            }
+        }
+    };
+
+    {
+        GGML_ASSERT(self_kq_mask);
+        GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
+
+        float * data = (float *) self_kq_mask->data;
+        std::fill(data, data + ggml_nelements(self_kq_mask), -INFINITY);
+
+        fill_mask(data, 0, LLAMA_SWA_TYPE_NONE);
+
+        if (debug) {
+            print_mask(data, n_tokens, n_kv, 0, LLAMA_SWA_TYPE_NONE);
+        }
+    }
+
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        GGML_ASSERT(self_kq_mask_swa);
+        GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
+
+        float * data = (float *) self_kq_mask_swa->data;
+        std::fill(data, data + ggml_nelements(self_kq_mask_swa), -INFINITY);
+
+        fill_mask(data, hparams.n_swa, hparams.swa_type);
+
+        if (debug) {
+            print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+        }
+    }
 }
...
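The refactor above routes the cache-less attention path through llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1) so a separate sliding-window mask can be filled alongside the regular one. A rough standalone sketch of the standard sliding-window rule only (an approximation: the real helper also covers chunked and symmetric window types, which are omitted here):

#include <cstdint>
#include <cstdio>

// Toy stand-in for the standard sliding-window case: a key at position p0 is
// masked for a query at position p1 once it falls outside the last n_swa positions.
static bool is_masked_swa_standard(int32_t n_swa, int32_t p0, int32_t p1) {
    if (n_swa <= 0) {
        return false; // no window configured, nothing extra is masked
    }
    return p1 - p0 >= n_swa;
}

int main() {
    const int32_t n_swa = 4;
    for (int32_t p0 = 0; p0 <= 7; ++p0) {
        std::printf("p1=7 p0=%d masked=%d\n", p0, is_masked_swa_standard(n_swa, p0, 7));
    }
}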
@@ -928,6 +950,31 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             cb(selection_probs, "ffn_moe_probs_biased", il);
         }
 
+        // select top n_group_used expert groups
+        // https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/e815299b0bcbac849fa540c768ef21845365c9eb/modeling_deepseek.py#L440-L457
+        if (hparams.n_expert_groups > 1 && n_tokens > 0) {
+            const int64_t n_exp_per_group = n_expert / hparams.n_expert_groups;
+
+            // organize experts into n_expert_groups
+            ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens]
+
+            ggml_tensor * group_scores = ggml_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
+            group_scores = ggml_get_rows(ctx0, ggml_reshape_4d(ctx0, selection_groups, 1, selection_groups->ne[0], selection_groups->ne[1], selection_groups->ne[2]), group_scores); // [1, 2, n_expert_groups, n_tokens]
+
+            // get top n_group_used expert groups
+            group_scores = ggml_sum_rows(ctx0, ggml_reshape_3d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2], group_scores->ne[3])); // [1, n_expert_groups, n_tokens]
+            group_scores = ggml_reshape_2d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2]); // [n_expert_groups, n_tokens]
+
+            ggml_tensor * expert_groups = ggml_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
+            cb(expert_groups, "ffn_moe_group_topk", il);
+
+            // mask out the other groups
+            selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
+            selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
+            selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
+            cb(selection_probs, "ffn_moe_probs_masked", il);
+        }
+
         // select experts
         ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
         cb(selected_experts->src[0], "ffn_moe_argsort", il);
...
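The block above adds DeepSeek-V3-style group-limited routing: experts are partitioned into n_expert_groups, each group is scored by the sum of its top-2 selection probabilities, only the best n_group_used groups survive, and all remaining experts are masked to -inf before the per-token top-k. A scalar toy version of that selection (plain C++, no ggml; the sizes and probabilities below are made up):

#include <algorithm>
#include <cstdio>
#include <functional>
#include <limits>
#include <utility>
#include <vector>

// Score each expert group by the sum of its top-2 probabilities, keep the best
// n_group_used groups, and mask every other expert so a later top-k skips them.
int main() {
    const int n_expert        = 8;
    const int n_expert_groups = 4;
    const int n_group_used    = 2;
    const int n_exp_per_group = n_expert / n_expert_groups;

    std::vector<float> probs = { 0.10f, 0.90f, 0.20f, 0.30f, 0.80f, 0.70f, 0.05f, 0.15f };

    // group score = sum of the top-2 probabilities inside each group
    std::vector<std::pair<float, int>> group_scores;
    for (int g = 0; g < n_expert_groups; ++g) {
        std::vector<float> grp(probs.begin() + g * n_exp_per_group,
                               probs.begin() + (g + 1) * n_exp_per_group);
        std::sort(grp.begin(), grp.end(), std::greater<float>());
        const float score = grp[0] + (grp.size() > 1 ? grp[1] : 0.0f);
        group_scores.push_back({ score, g });
    }

    // keep the n_group_used best-scoring groups
    std::partial_sort(group_scores.begin(), group_scores.begin() + n_group_used,
                      group_scores.end(), std::greater<std::pair<float, int>>());
    std::vector<bool> keep(n_expert_groups, false);
    for (int i = 0; i < n_group_used; ++i) {
        keep[group_scores[i].second] = true;
    }

    // mask experts that belong to dropped groups
    for (int e = 0; e < n_expert; ++e) {
        if (!keep[e / n_exp_per_group]) {
            probs[e] = -std::numeric_limits<float>::infinity();
        }
    }

    for (int e = 0; e < n_expert; ++e) {
        std::printf("expert %d: %f\n", e, probs[e]);
    }
}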
@@ -959,6 +1006,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
             cb(weights_sum, "ffn_moe_weights_sum", il);
 
+            if (arch == LLM_ARCH_BAILINGMOE2) {
+                weights_sum = ggml_scale_bias(ctx0, weights_sum, 1.0, 1e-20);
+                cb(weights_sum, "ffn_moe_weights_sum_biased", il);
+            }
+
             weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
             cb(weights, "ffn_moe_weights_norm", il);
...
@@ -1299,12 +1351,9 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         k = ggml_permute(ctx0, k, 0, 2, 1, 3);
         v = ggml_permute(ctx0, v, 0, 2, 1, 3);
 
-    const auto n_kv = k->ne[1];
-
     ggml_tensor * cur;
 
-    // TODO: replace hardcoded padding with ggml-provided padding
-    if (cparams.flash_attn && (n_kv % 256 == 0) && kq_b == nullptr) {
+    if (cparams.flash_attn && kq_b == nullptr) {
         GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet");
 
         if (v_trans) {
...
@@ -1419,10 +1468,20 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
     auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
 
     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
-    ggml_set_input(inp->kq_mask);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    ggml_set_input(inp->self_kq_mask);
 
-    inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask;
+    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+        ggml_set_input(inp->self_kq_mask_swa);
+
+        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+    } else {
+        inp->self_kq_mask_swa     = nullptr;
+        inp->self_kq_mask_swa_cnv = nullptr;
+    }
 
     return (llm_graph_input_attn_no_cache *) res->add_input(std::move(inp));
 }
...
@@ -1447,7 +1506,9 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_build_forward_expand(gf, k_cur);
     ggml_build_forward_expand(gf, v_cur);
 
-    const auto & kq_mask = inp->get_kq_mask();
+    const bool is_swa = hparams.is_swa(il);
+
+    const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
 
     // [TAG_NO_CACHE_PAD]
     // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
...
llama/llama.cpp/src/llama-graph.h  (view file @ 544b6739)
...
@@ -257,10 +257,14 @@ public:
     void set_input(const llama_ubatch * ubatch) override;
 
-    ggml_tensor * get_kq_mask() const { return kq_mask_cnv; }
+    ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv;     }
+    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
 
-    ggml_tensor * kq_mask     = nullptr; // F32 [n_tokens, n_batch, 1, 1]
-    ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch, 1, 1]
+    // n_tokens == n_batch
+    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_tokens, n_batch/n_stream, 1, n_stream]
 
     const llama_hparams hparams;
     const llama_cparams cparams;
...
llama/llama.cpp/src/llama-hparams.h  (view file @ 544b6739)
...
@@ -74,6 +74,8 @@ struct llama_hparams {
     uint32_t n_ff_chexp      = 0;
     uint32_t n_expert_shared = 0;
     uint32_t n_norm_groups   = 0;
+    uint32_t n_expert_groups = 0;
+    uint32_t n_group_used    = 0;
 
     uint32_t n_group_experts = 0;
     float    expert_group_scale = 0.05f;
...
llama/llama.cpp/src/llama-model.cpp  (view file @ 544b6739)

(diff collapsed: +321 -46)
llama/llama.cpp/src/llama-model.h  (view file @ 544b6739)
...
@@ -108,9 +108,12 @@ enum llm_type {
     LLM_TYPE_17B_16E,  // llama4 Scout
     LLM_TYPE_17B_128E, // llama4 Maverick
     LLM_TYPE_A13B,
+    LLM_TYPE_7B_A1B,
     LLM_TYPE_8B_A1B, // lfm2moe
+    LLM_TYPE_16B_A1B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
+    LLM_TYPE_100B_A6B,
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
     LLM_TYPE_235B_A22B,
     LLM_TYPE_300B_A47B, // Ernie MoE big
...
llama/llama.cpp/src/llama-quant.cpp  (view file @ 544b6739)
...
@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
+    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
...
@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
+
+        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
...
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
+
         ggml_type new_type;
         void * new_data;
         size_t new_size;
...
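The name.rfind("mm.", 0) == 0 expression used above is the usual pre-C++20 idiom for a prefix test: the search is anchored at index 0, so it can only match at the start of the string. A tiny standalone check (the tensor names below are illustrative, not taken from any model file):

#include <cassert>
#include <string>

// rfind(prefix, 0) can only match at index 0, so it behaves like starts_with()
// without requiring C++20.
static bool has_prefix(const std::string & name, const std::string & prefix) {
    return name.rfind(prefix, 0) == 0;
}

int main() {
    assert(has_prefix("mm.some_projector.weight", "mm."));  // multimodal ("mm.") tensor
    assert(!has_prefix("blk.0.attn_q.weight", "mm."));      // regular LLM tensor
}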
llama/llama.cpp/src/llama-vocab.cpp  (view file @ 544b6739)
...
@@ -1957,6 +1957,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             clean_spaces = false;
         } else if (
             tokenizer_pre == "bailingmoe"  ||
+            tokenizer_pre == "bailingmoe2" ||
             tokenizer_pre == "llada-moe") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
             clean_spaces = false;
...
llama/llama.cpp/src/llama.cpp  (view file @ 544b6739)
...
@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     } catch(const std::exception & e) {
         throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
     }
+    if (model.arch == LLM_ARCH_CLIP) {
+        throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
+    }
     try {
         model.load_vocab(ml);
     } catch(const std::exception & e) {
...
@@ -314,6 +317,7 @@ struct llama_model * llama_model_load_from_splits(
         LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
         return nullptr;
     }
+    splits.reserve(n_paths);
     for (size_t i = 0; i < n_paths; ++i) {
         splits.push_back(paths[i]);
     }
...
llama/llama.cpp/tools/mtmd/clip-impl.h  (view file @ 544b6739)
...
@@ -30,6 +30,7 @@
 #define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"
 
 // vision-specific
+#define KEY_VISION_PROJ_TYPE    "clip.vision.projector_type" // for models with mixed modalities
 #define KEY_IMAGE_SIZE          "clip.vision.image_size"
 #define KEY_PREPROC_IMAGE_SIZE  "clip.vision.preproc_image_size"
 #define KEY_PATCH_SIZE          "clip.vision.patch_size"
...
@@ -48,6 +49,7 @@
 #define KEY_MINICPMV_QUERY_NUM  "clip.minicpmv_query_num"
 
 // audio-specific
+#define KEY_AUDIO_PROJ_TYPE     "clip.audio.projector_type" // for models with mixed modalities
 #define KEY_A_NUM_MEL_BINS      "clip.audio.num_mel_bins"
 #define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
...
llama/llama.cpp/tools/mtmd/clip.cpp  (view file @ 544b6739)
...
@@ -2234,15 +2234,27 @@ struct clip_model_loader {
         // projector type
         std::string proj_type;
         {
+            // default key
             get_string(KEY_PROJ_TYPE, proj_type, false);
-            if (!proj_type.empty()) {
-                model.proj_type = clip_projector_type_from_string(proj_type);
+
+            // for models with mixed modalities
+            if (proj_type.empty()) {
+                if (modality == CLIP_MODALITY_VISION) {
+                    get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
+                } else if (modality == CLIP_MODALITY_AUDIO) {
+                    get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false);
+                } else {
+                    GGML_ABORT("unknown modality");
+                }
             }
+
+            model.proj_type = clip_projector_type_from_string(proj_type);
+
             if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
                 throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
             }
-            // correct arch for multimodal models
+
+            // correct arch for multimodal models (legacy method)
             if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
                 model.proj_type = modality == CLIP_MODALITY_VISION ? PROJECTOR_TYPE_QWEN25VL
...
llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch  (view file @ 544b6739)
...
@@ -23,7 +23,7 @@ problem.
  8 files changed, 21 insertions(+), 2 deletions(-)
 
 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index ff9135fe..8ba86f82 100644
+index ff9135fe2..8ba86f824 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
 @@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
...
@@ -64,18 +64,18 @@ index ff9135fe..8ba86f82 100644
  /* .init_tensor   = */ NULL, // no initialization required
  /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
 diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
-index ad1adba6..7d44f74f 100755
+index 8bd5449f1..01e2df61a 100644
 --- a/ggml/src/ggml-cann/ggml-cann.cpp
 +++ b/ggml/src/ggml-cann/ggml-cann.cpp
-@@ -843,6 +843,7 @@ static void ggml_backend_cann_buffer_free_buffer(
-     ggml_backend_cann_buffer_context* ctx =
-         (ggml_backend_cann_buffer_context*)buffer->context;
+@@ -820,6 +820,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
+ static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+     ggml_backend_cann_buffer_context * ctx =
+         (ggml_backend_cann_buffer_context *) buffer->context;
      delete ctx;
 +    delete buffer;
  }
 
  /**
-@@ -1630,6 +1631,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
+@@ -1560,6 +1561,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
  */
  static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
      ACL_CHECK(aclrtFreeHost(buffer->context));
...
@@ -84,10 +84,10 @@ index ad1adba6..7d44f74f 100755
  /**
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 856e9de2..c0b1e4c1 100644
+index bc396b521..aefc6935e 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -567,6 +567,7 @@ struct ggml_backend_cuda_buffer_context {
+@@ -576,6 +576,7 @@ struct ggml_backend_cuda_buffer_context {
  static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
      delete ctx;
...
@@ -95,7 +95,7 @@ index 856e9de2..c0b1e4c1 100644
  }
 
  static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
-@@ -822,6 +823,7 @@ struct ggml_backend_cuda_split_buffer_context {
+@@ -831,6 +832,7 @@ struct ggml_backend_cuda_split_buffer_context {
  static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
      delete ctx;
...
@@ -103,7 +103,7 @@ index 856e9de2..c0b1e4c1 100644
  }
 
  static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1103,6 +1105,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
+@@ -1112,6 +1114,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      CUDA_CHECK(cudaFreeHost(buffer->context));
...
@@ -112,7 +112,7 @@ index 856e9de2..c0b1e4c1 100644
  static void * ggml_cuda_host_malloc(size_t size) {
 
 diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
-index 7afc881f..bf096227 100644
+index 7afc881fa..bf0962274 100644
 --- a/ggml/src/ggml-metal/ggml-metal.cpp
 +++ b/ggml/src/ggml-metal/ggml-metal.cpp
 @@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
...
@@ -132,10 +132,10 @@ index 7afc881f..bf096227 100644
  static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
 
 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
-index 79d21487..38c75018 100644
+index db33a4ab6..c42ee26e1 100644
 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp
 +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
-@@ -3212,6 +3212,7 @@ struct ggml_backend_opencl_buffer_context {
+@@ -3266,6 +3266,7 @@ struct ggml_backend_opencl_buffer_context {
  static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
      delete ctx;
...
@@ -144,7 +144,7 @@ index 79d21487..38c75018 100644
  static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
 
 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index aad48d62..a46c0f52 100644
+index a38df5a97..fd07e4a21 100644
 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
 +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
 @@ -528,6 +528,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
...
@@ -156,10 +156,10 @@ index aad48d62..a46c0f52 100644
  static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
 
 diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 45b8c216..4ec9a592 100644
+index b695ba051..37e853120 100644
 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
 +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
-@@ -334,6 +334,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+@@ -352,6 +352,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
      ggml_sycl_set_device(ctx->device);
      delete ctx;
...
@@ -167,7 +167,7 @@ index 45b8c216..4ec9a592 100644
  }
  catch (sycl::exception const &exc) {
    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-@@ -795,6 +796,7 @@ struct ggml_backend_sycl_split_buffer_context {
+@@ -813,6 +814,7 @@ struct ggml_backend_sycl_split_buffer_context {
  static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
      delete ctx;
...
@@ -175,7 +175,7 @@ index 45b8c216..4ec9a592 100644
  }
 
  static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1137,6 +1139,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
+@@ -1155,6 +1157,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
  static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_sycl_host_free(buffer->context);
...
@@ -184,10 +184,10 @@ index 45b8c216..4ec9a592 100644
  static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 3cd89c71..ed83236f 100644
+index b783f7805..216dc167c 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -11600,6 +11600,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -11828,6 +11828,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
      ggml_vk_destroy_buffer(ctx->dev_buffer);
      delete ctx;
...
@@ -195,7 +195,7 @@ index 3cd89c71..ed83236f 100644
  }
 
  static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -11743,6 +11744,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -11971,6 +11972,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
  static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
      ggml_vk_host_free(vk_instance.devices[0], buffer->context);
...