OpenDAS / ollama · Commits

Commit 544b6739 (unverified)
ggml update to b6840 (#12791)

Authored Nov 06, 2025 by Daniel Hiltgen; committed by GitHub on Nov 06, 2025
Parent: c4ba257c
Changes: 103
Showing 20 changed files with 585 additions and 135 deletions (+585, -135)
Makefile.sync                                          +1    -1
llama/build-info.cpp                                   +1    -1
llama/llama.cpp/common/json-schema-to-grammar.cpp      +12   -12
llama/llama.cpp/src/llama-arch.cpp                     +40   -0
llama/llama.cpp/src/llama-arch.h                       +4    -0
llama/llama.cpp/src/llama-batch.h                      +1    -1
llama/llama.cpp/src/llama-chat.cpp                     +35   -2
llama/llama.cpp/src/llama-chat.h                       +2    -0
llama/llama.cpp/src/llama-context.cpp                  +2    -1
llama/llama.cpp/src/llama-graph.cpp                    +104  -43
llama/llama.cpp/src/llama-graph.h                      +7    -3
llama/llama.cpp/src/llama-hparams.h                    +2    -0
llama/llama.cpp/src/llama-model.cpp                    +321  -46
llama/llama.cpp/src/llama-model.h                      +3    -0
llama/llama.cpp/src/llama-quant.cpp                    +7    -1
llama/llama.cpp/src/llama-vocab.cpp                    +1    -0
llama/llama.cpp/src/llama.cpp                          +4    -0
llama/llama.cpp/tools/mtmd/clip-impl.h                 +2    -0
llama/llama.cpp/tools/mtmd/clip.cpp                    +15   -3
llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch  +21  -21
Makefile.sync  (view file @ 544b6739)

 UPSTREAM=https://github.com/ggml-org/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=7049736b2dd9011bf819e298b844ebbc4b5afdc9
+FETCH_HEAD=3cfa9c3f125763305b4226bc032f1954f08990dc

 .PHONY: help

 help:
llama/build-info.cpp  (view file @ 544b6739)

 int LLAMA_BUILD_NUMBER = 0;
-char const * LLAMA_COMMIT = "7049736b2dd9011bf819e298b844ebbc4b5afdc9";
+char const * LLAMA_COMMIT = "3cfa9c3f125763305b4226bc032f1954f08990dc";
 char const * LLAMA_COMPILER = "";
 char const * LLAMA_BUILD_TARGET = "";
llama/llama.cpp/common/json-schema-to-grammar.cpp  (view file @ 544b6739)

@@ -41,9 +41,9 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }

-static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
-    auto has_min = min_value != std::numeric_limits<int>::min();
-    auto has_max = max_value != std::numeric_limits<int>::max();
+static void _build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+    auto has_min = min_value != std::numeric_limits<int64_t>::min();
+    auto has_max = max_value != std::numeric_limits<int64_t>::max();

     auto digit_range = [&](char from, char to) {
         out << "[";
@@ -159,7 +159,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
     if (has_min) {
         if (min_value < 0) {
             out << "\"-\" (";
-            _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
             out << ") | [0] | [1-9] ";
             more_digits(0, decimals_left - 1);
         } else if (min_value == 0) {
@@ -194,7 +194,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
             }
             digit_range(c, c);
             out << " (";
-            _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
+            _build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
             out << ")";
             if (c < '9') {
                 out << " | ";
@@ -216,7 +216,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
             _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
         } else {
             out << "\"-\" (";
-            _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
             out << ")";
         }
         return;
@@ -925,17 +925,17 @@ public:
             int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
             return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
         } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
-            int min_value = std::numeric_limits<int>::min();
-            int max_value = std::numeric_limits<int>::max();
+            int64_t min_value = std::numeric_limits<int64_t>::min();
+            int64_t max_value = std::numeric_limits<int64_t>::max();
             if (schema.contains("minimum")) {
-                min_value = schema["minimum"].get<int>();
+                min_value = schema["minimum"].get<int64_t>();
             } else if (schema.contains("exclusiveMinimum")) {
-                min_value = schema["exclusiveMinimum"].get<int>() + 1;
+                min_value = schema["exclusiveMinimum"].get<int64_t>() + 1;
             }
             if (schema.contains("maximum")) {
-                max_value = schema["maximum"].get<int>();
+                max_value = schema["maximum"].get<int64_t>();
             } else if (schema.contains("exclusiveMaximum")) {
-                max_value = schema["exclusiveMaximum"].get<int>() - 1;
+                max_value = schema["exclusiveMaximum"].get<int64_t>() - 1;
             }
             std::stringstream out;
             out << "(";
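The int to int64_t widening above is what lets the generated grammar honor JSON schema integer bounds that do not fit in 32 bits; a minimal standalone sketch (not part of the commit) of the truncation the old 32-bit path would hit:

    #include <cstdint>
    #include <iostream>
    #include <limits>

    int main() {
        // A schema bound such as {"minimum": 4000000000} does not fit in a 32-bit int.
        const int64_t schema_minimum = 4000000000LL;

        // Reading it through a 32-bit int (the old code path) wraps/truncates
        // (implementation-defined; typically a negative value on common platforms).
        const int     narrow = static_cast<int>(schema_minimum);
        const int64_t wide   = schema_minimum; // preserved by the new int64_t path

        std::cout << "as int:      " << narrow << "\n";
        std::cout << "as int64_t:  " << wide   << "\n";
        std::cout << "int64_t max: " << std::numeric_limits<int64_t>::max() << "\n";
        return 0;
    }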
llama/llama.cpp/src/llama-arch.cpp  (view file @ 544b6739)

@@ -5,6 +5,7 @@
 #include <map>

 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+    { LLM_ARCH_CLIP,   "clip"   }, // dummy, only used by llama-quantize
     { LLM_ARCH_LLAMA,  "llama"  },
     { LLM_ARCH_LLAMA4, "llama4" },
     { LLM_ARCH_DECI,   "deci"   },
@@ -85,6 +86,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
     { LLM_ARCH_PLM,              "plm"              },
     { LLM_ARCH_BAILINGMOE,       "bailingmoe"       },
+    { LLM_ARCH_BAILINGMOE2,      "bailingmoe2"      },
     { LLM_ARCH_DOTS1,            "dots1"            },
     { LLM_ARCH_ARCEE,            "arcee"            },
     { LLM_ARCH_ERNIE4_5,         "ernie4_5"         },
@@ -135,6 +137,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_COUNT,            "%s.expert_count"            },
     { LLM_KV_EXPERT_USED_COUNT,       "%s.expert_used_count"       },
     { LLM_KV_EXPERT_SHARED_COUNT,     "%s.expert_shared_count"     },
+    { LLM_KV_EXPERT_GROUP_COUNT,      "%s.expert_group_count"      },
+    { LLM_KV_EXPERT_GROUP_USED_COUNT, "%s.expert_group_used_count" },
     { LLM_KV_EXPERT_WEIGHTS_SCALE,    "%s.expert_weights_scale"    },
     { LLM_KV_EXPERT_WEIGHTS_NORM,     "%s.expert_weights_norm"     },
     { LLM_KV_EXPERT_GATING_FUNC,      "%s.expert_gating_func"      },
@@ -277,6 +281,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 };

 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+    {
+        LLM_ARCH_CLIP,
+        {},
+    },
     {
         LLM_ARCH_LLAMA,
         {
@@ -1961,6 +1969,38 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
         },
     },
+    {
+        LLM_ARCH_BAILINGMOE2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,             "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,            "output_norm" },
+            { LLM_TENSOR_OUTPUT,                 "output" },
+            { LLM_TENSOR_ATTN_NORM,              "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q_NORM,            "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,            "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_QKV,               "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,               "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP,           "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B,        "blk.%d.exp_probs_b" },
+            { LLM_TENSOR_FFN_NORM,               "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,               "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,               "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,                 "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXPS,          "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,          "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,            "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,         "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,         "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,           "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_NEXTN_EH_PROJ,          "blk.%d.nextn.eh_proj" },
+            { LLM_TENSOR_NEXTN_EMBED_TOKENS,     "blk.%d.nextn.embed_tokens" },
+            { LLM_TENSOR_NEXTN_ENORM,            "blk.%d.nextn.enorm" },
+            { LLM_TENSOR_NEXTN_HNORM,            "blk.%d.nextn.hnorm" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+            { LLM_TENSOR_LAYER_OUT_NORM,         "blk.%d.layer_output_norm" },
+        },
+    },
     {
         LLM_ARCH_DOTS1,
         {
llama/llama.cpp/src/llama-arch.h  (view file @ 544b6739)

@@ -9,6 +9,7 @@
 //

 enum llm_arch {
+    LLM_ARCH_CLIP,
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
     LLM_ARCH_DECI,
@@ -89,6 +90,7 @@ enum llm_arch {
     LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_PLM,
     LLM_ARCH_BAILINGMOE,
+    LLM_ARCH_BAILINGMOE2,
     LLM_ARCH_DOTS1,
     LLM_ARCH_ARCEE,
     LLM_ARCH_ERNIE4_5,
@@ -139,6 +141,8 @@ enum llm_kv {
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_EXPERT_SHARED_COUNT,
+    LLM_KV_EXPERT_GROUP_COUNT,
+    LLM_KV_EXPERT_GROUP_USED_COUNT,
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
llama/llama.cpp/src/llama-batch.h  (view file @ 544b6739)

@@ -123,7 +123,7 @@ private:
     uint32_t n_seq_max;
     uint32_t n_outputs;

-    std::array<llama_seq_id, 1> seq_id_0 = { 0 };     // default sequence id
+    std::array<llama_seq_id, 1> seq_id_0 = { { 0 } }; // default sequence id

     std::vector<llama_pos> pos;
     std::vector<int32_t>   n_seq_id;
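The doubled braces are the fully braced aggregate form of std::array; a small standalone sketch (not from the commit) of the two equivalent spellings:

    #include <array>

    // std::array is an aggregate that wraps a built-in array, so the fully braced
    // form has one brace set for the struct and one for the inner array. Both lines
    // are equivalent, but the single-brace form can trigger -Wmissing-braces on
    // some compilers.
    std::array<int, 1> a = { 0 };     // relies on brace elision
    std::array<int, 1> b = { { 0 } }; // fully braced

    int main() { return a[0] + b[0]; }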
llama/llama.cpp/src/llama-chat.cpp  (view file @ 544b6739)

@@ -63,6 +63,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "megrez",        LLM_CHAT_TEMPLATE_MEGREZ        },
     { "yandex",        LLM_CHAT_TEMPLATE_YANDEX        },
     { "bailing",       LLM_CHAT_TEMPLATE_BAILING       },
+    { "bailing-think", LLM_CHAT_TEMPLATE_BAILING_THINK },
+    { "bailing2",      LLM_CHAT_TEMPLATE_BAILING2      },
     { "llama4",        LLM_CHAT_TEMPLATE_LLAMA4        },
     { "smolvlm",       LLM_CHAT_TEMPLATE_SMOLVLM       },
     { "hunyuan-moe",   LLM_CHAT_TEMPLATE_HUNYUAN_MOE   },
@@ -191,6 +193,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_YANDEX;
     } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("'HUMAN'")) {
         return LLM_CHAT_TEMPLATE_BAILING;
+    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("\"HUMAN\"") && tmpl_contains("<think>")) {
+        return LLM_CHAT_TEMPLATE_BAILING_THINK;
+    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("<role>HUMAN</role>") && tmpl_contains("<|role_end|>")) {
+        return LLM_CHAT_TEMPLATE_BAILING2;
     } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
         return LLM_CHAT_TEMPLATE_LLAMA4;
     } else if (tmpl_contains("<|endofuserprompt|>")) {
@@ -644,8 +650,8 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << " Ассистент:[SEP]";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING) {
-        // Bailing (Ling) template
+    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING || tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
+        // Bailing (Ling/Ring) template
         for (auto message : chat) {
             std::string role(message->role);
@@ -658,6 +664,33 @@ int32_t llm_chat_apply_template(
             ss << "<role>" << role << "</role>" << message->content;
         }

+        if (add_ass) {
+            ss << "<role>ASSISTANT</role>";
+
+            if (tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
+                ss << "<think>";
+            }
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING2) {
+        // Bailing2 (Ling 2.0) template
+        bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+
+        if (!has_system) {
+            ss << "<role>SYSTEM</role>detailed thinking off<|role_end|>";
+        }
+
+        for (auto message : chat) {
+            std::string role(message->role);
+
+            if (role == "user") {
+                role = "HUMAN";
+            } else {
+                std::transform(role.begin(), role.end(), role.begin(), ::toupper);
+            }
+
+            ss << "<role>" << role << "</role>" << message->content << "<|role_end|>";
+        }
+
         if (add_ass) {
             ss << "<role>ASSISTANT</role>";
         }
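For reference, a minimal sketch of the prompt string the new Bailing2 branch assembles for a single user turn with add_ass enabled (hypothetical message content, not part of the commit):

    #include <iostream>
    #include <string>

    // Illustration of the Bailing2 (Ling 2.0) prompt layout built above:
    // no system message in the input, one user turn, add_ass == true.
    int main() {
        std::string prompt;
        prompt += "<role>SYSTEM</role>detailed thinking off<|role_end|>"; // injected default system turn
        prompt += "<role>HUMAN</role>Hello!<|role_end|>";                 // "user" is renamed to HUMAN
        prompt += "<role>ASSISTANT</role>";                               // generation starts here
        std::cout << prompt << "\n";
        return 0;
    }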
llama/llama.cpp/src/llama-chat.h  (view file @ 544b6739)

@@ -42,6 +42,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MEGREZ,
     LLM_CHAT_TEMPLATE_YANDEX,
     LLM_CHAT_TEMPLATE_BAILING,
+    LLM_CHAT_TEMPLATE_BAILING_THINK,
+    LLM_CHAT_TEMPLATE_BAILING2,
     LLM_CHAT_TEMPLATE_LLAMA4,
     LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_DOTS1,
llama/llama.cpp/src/llama-context.cpp  (view file @ 544b6739)

@@ -2345,7 +2345,8 @@ llama_context * llama_init_from_model(
         return nullptr;
     }

-    if (params.pooling_type != model->hparams.pooling_type) {
+    if (params.pooling_type != LLAMA_POOLING_TYPE_UNSPECIFIED &&
+        params.pooling_type != model->hparams.pooling_type) {
         //user-specified pooling-type is different from the model default
         LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,
                        model->hparams.pooling_type, params.pooling_type);
llama/llama.cpp/src/llama-graph.cpp  (view file @ 544b6739)

@@ -261,12 +261,17 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     }
 }

-static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
     LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
-    const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE) ? "LLAMA_SWA_TYPE_NONE" :
-                          (swa_type == LLAMA_SWA_TYPE_STANDARD) ? "LLAMA_SWA_TYPE_STANDARD" :
-                          (swa_type == LLAMA_SWA_TYPE_CHUNKED) ? "LLAMA_SWA_TYPE_CHUNKED" :
-                          (swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
+    const char * swa_type_str = "unknown";
+    switch (swa_type) {
+        case LLAMA_SWA_TYPE_NONE:      swa_type_str = "LLAMA_SWA_TYPE_NONE";      break;
+        case LLAMA_SWA_TYPE_STANDARD:  swa_type_str = "LLAMA_SWA_TYPE_STANDARD";  break;
+        case LLAMA_SWA_TYPE_CHUNKED:   swa_type_str = "LLAMA_SWA_TYPE_CHUNKED";   break;
+        case LLAMA_SWA_TYPE_SYMMETRIC: swa_type_str = "LLAMA_SWA_TYPE_SYMMETRIC"; break;
+    };
     LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int) n_swa, (int) n_kv, swa_type_str);
     LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
     LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
@@ -295,51 +300,68 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
     const int64_t n_kv     = ubatch->n_tokens;
     const int64_t n_tokens = ubatch->n_tokens;

-    GGML_ASSERT(kq_mask);
-    GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer));
-
-    float * data = (float *) kq_mask->data;
-
-    // [TAG_NO_CACHE_ISWA]
-    GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
-
-    for (int h = 0; h < 1; ++h) {
-        for (int i1 = 0; i1 < n_tokens; ++i1) {
-            const llama_seq_id s1 = ubatch->seq_id[i1][0];
-
-            for (int i0 = 0; i0 < n_tokens; ++i0) {
-                float f = -INFINITY;
-
-                for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
-                    const llama_seq_id s0 = ubatch->seq_id[i0][0];
-
-                    if (s0 != s1) {
-                        continue; // skip different sequences
-                    }
-
-                    if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
-                        continue; // skip future tokens for causal attention
-                    }
-
-                    // TODO: this does not take into account that some layers are SWA and others are note (i.e. iSWA) [TAG_NO_CACHE_ISWA]
-                    // if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
-                    //     continue; // skip masked tokens for SWA
-                    // }
-
-                    // TODO: reimplement this like in llama_kv_cache_unified
-                    if (hparams.use_alibi) {
-                        f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
-                    } else {
-                        f = 0.0f;
-                    }
-                }
-
-                data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
-            }
-        }
-    }
+    const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
+        for (int h = 0; h < 1; ++h) {
+            for (int i1 = 0; i1 < n_tokens; ++i1) {
+                const llama_seq_id s1 = ubatch->seq_id[i1][0];
+                const llama_pos    p1 = ubatch->pos[i1];
+
+                const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;
+
+                for (int i0 = 0; i0 < n_tokens; ++i0) {
+                    const llama_seq_id s0 = ubatch->seq_id[i0][0];
+                    const llama_pos    p0 = ubatch->pos[i0];
+
+                    // mask different sequences
+                    if (s0 != s1) {
+                        continue;
+                    }
+
+                    // mask future tokens
+                    if (cparams.causal_attn && p0 > p1) {
+                        continue;
+                    }
+
+                    // apply SWA if any
+                    if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
+                        continue;
+                    }
+
+                    data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
+                }
+            }
+        }
+    };
+
+    {
+        GGML_ASSERT(self_kq_mask);
+        GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
+
+        float * data = (float *) self_kq_mask->data;
+
+        std::fill(data, data + ggml_nelements(self_kq_mask), -INFINITY);
+
+        fill_mask(data, 0, LLAMA_SWA_TYPE_NONE);
+
+        if (debug) {
+            print_mask(data, n_tokens, n_kv, 0, LLAMA_SWA_TYPE_NONE);
+        }
+    }
+
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        GGML_ASSERT(self_kq_mask_swa);
+        GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
+
+        float * data = (float *) self_kq_mask_swa->data;
+
+        std::fill(data, data + ggml_nelements(self_kq_mask_swa), -INFINITY);
+
+        fill_mask(data, hparams.n_swa, hparams.swa_type);

         if (debug) {
             print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
         }
+    }
 }

 void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
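For intuition, a toy standalone sketch (not from the commit) of the kind of mask the fill_mask lambda builds for one sequence: a query position attends only to non-future positions and, when a sliding window is active, only to positions inside the window. The exact boundary convention lives in llama_hparams::is_masked_swa, so the ">=" below is this sketch's assumption.

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_tokens = 6;
        const int n_swa    = 3; // sliding window size; 0 means no window

        // data[j*n_tokens + i] mirrors the kq mask layout: 0.0f = can attend, -inf = masked
        std::vector<float> data(n_tokens * n_tokens, -1e30f);

        for (int j = 0; j < n_tokens; ++j) {           // query token
            for (int i = 0; i < n_tokens; ++i) {       // key/value token
                if (i > j)                       continue; // causal: no future tokens
                if (n_swa > 0 && j - i >= n_swa) continue; // outside the sliding window (assumed boundary)
                data[j*n_tokens + i] = 0.0f;
            }
        }

        // print in the same spirit as print_mask(): '0' = can attend, 'x' = masked
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_tokens; ++i) {
                std::printf("%c ", data[j*n_tokens + i] == 0.0f ? '0' : 'x');
            }
            std::printf("\n");
        }
        return 0;
    }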
@@ -928,6 +950,31 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cb(selection_probs, "ffn_moe_probs_biased", il);
     }

+    // select top n_group_used expert groups
+    // https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/e815299b0bcbac849fa540c768ef21845365c9eb/modeling_deepseek.py#L440-L457
+    if (hparams.n_expert_groups > 1 && n_tokens > 0) {
+        const int64_t n_exp_per_group = n_expert / hparams.n_expert_groups;
+
+        // organize experts into n_expert_groups
+        ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens]
+
+        ggml_tensor * group_scores = ggml_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
+        group_scores = ggml_get_rows(ctx0, ggml_reshape_4d(ctx0, selection_groups, 1, selection_groups->ne[0], selection_groups->ne[1], selection_groups->ne[2]), group_scores); // [1, 2, n_expert_groups, n_tokens]
+
+        // get top n_group_used expert groups
+        group_scores = ggml_sum_rows(ctx0, ggml_reshape_3d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2], group_scores->ne[3])); // [1, n_expert_groups, n_tokens]
+        group_scores = ggml_reshape_2d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2]); // [n_expert_groups, n_tokens]
+
+        ggml_tensor * expert_groups = ggml_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
+        cb(expert_groups, "ffn_moe_group_topk", il);
+
+        // mask out the other groups
+        selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
+        selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
+        selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
+        cb(selection_probs, "ffn_moe_probs_masked", il);
+    }
+
     // select experts
     ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
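The block above implements DeepSeek-V3-style group-limited routing with ggml ops. A small CPU-side sketch (hypothetical scores, not from the commit) of the same idea: score each expert group by the sum of its top-2 experts, keep the best n_group_used groups, and mask the rest before the final top-k over experts.

    #include <algorithm>
    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
        // hypothetical router scores for n_expert = 8 experts split into 4 groups of 2
        std::vector<float> probs = {0.9f, 0.1f, 0.2f, 0.3f, 0.8f, 0.7f, 0.4f, 0.05f};
        const int n_expert_groups = 4, n_exp_per_group = 2, n_group_used = 2;

        // score each group by the sum of its top-2 experts (here that is both, since groups have 2)
        std::vector<std::pair<float, int>> group_scores;
        for (int g = 0; g < n_expert_groups; ++g) {
            std::vector<float> grp(probs.begin() + g*n_exp_per_group, probs.begin() + (g + 1)*n_exp_per_group);
            std::sort(grp.rbegin(), grp.rend());
            group_scores.push_back({grp[0] + grp[1], g});
        }

        // keep the best n_group_used groups, mask the rest before the final top-k
        std::sort(group_scores.rbegin(), group_scores.rend());
        std::vector<bool> keep(n_expert_groups, false);
        for (int i = 0; i < n_group_used; ++i) keep[group_scores[i].second] = true;

        for (int e = 0; e < (int) probs.size(); ++e) {
            if (!keep[e / n_exp_per_group]) probs[e] = -1e30f; // stands in for -INFINITY
        }

        for (float p : probs) std::printf("%g ", p); // experts outside the top groups can no longer be selected
        std::printf("\n");
        return 0;
    }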
@@ -959,6 +1006,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
         cb(weights_sum, "ffn_moe_weights_sum", il);

+        if (arch == LLM_ARCH_BAILINGMOE2) {
+            weights_sum = ggml_scale_bias(ctx0, weights_sum, 1.0, 1e-20);
+            cb(weights_sum, "ffn_moe_weights_sum_biased", il);
+        }
+
         weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
         cb(weights, "ffn_moe_weights_norm", il);
@@ -1299,12 +1351,9 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         k = ggml_permute(ctx0, k, 0, 2, 1, 3);
         v = ggml_permute(ctx0, v, 0, 2, 1, 3);

-    const auto n_kv = k->ne[1];
-
     ggml_tensor * cur;

-    // TODO: replace hardcoded padding with ggml-provided padding
-    if (cparams.flash_attn && (n_kv % 256 == 0) && kq_b == nullptr) {
+    if (cparams.flash_attn && kq_b == nullptr) {
         GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet");

         if (v_trans) {
@@ -1419,10 +1468,20 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() const {
     auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);

     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
-    ggml_set_input(inp->kq_mask);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    ggml_set_input(inp->self_kq_mask);

-    inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask;
+    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+        ggml_set_input(inp->self_kq_mask_swa);
+
+        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+    } else {
+        inp->self_kq_mask_swa     = nullptr;
+        inp->self_kq_mask_swa_cnv = nullptr;
+    }

     return (llm_graph_input_attn_no_cache *) res->add_input(std::move(inp));
 }
@@ -1447,7 +1506,9 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_build_forward_expand(gf, k_cur);
     ggml_build_forward_expand(gf, v_cur);

-    const auto & kq_mask = inp->get_kq_mask();
+    const bool is_swa = hparams.is_swa(il);
+
+    const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();

     // [TAG_NO_CACHE_PAD]
     // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
llama/llama.cpp/src/llama-graph.h  (view file @ 544b6739)

@@ -257,10 +257,14 @@ public:
     void set_input(const llama_ubatch * ubatch) override;

-    ggml_tensor * get_kq_mask() const { return kq_mask_cnv; }
+    ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv; }
+    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }

-    ggml_tensor * kq_mask     = nullptr; // F32 [n_tokens, n_batch, 1, 1]
-    ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch, 1, 1]
+    // n_tokens == n_batch
+    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_tokens, n_batch/n_stream, 1, n_stream]

     const llama_hparams hparams;
     const llama_cparams cparams;
llama/llama.cpp/src/llama-hparams.h  (view file @ 544b6739)

@@ -74,6 +74,8 @@ struct llama_hparams {
     uint32_t n_ff_chexp      = 0;
     uint32_t n_expert_shared = 0;
     uint32_t n_norm_groups   = 0;
+    uint32_t n_expert_groups = 0;
+    uint32_t n_group_used    = 0;
     uint32_t n_group_experts = 0;

     float expert_group_scale = 0.05f;
llama/llama.cpp/src/llama-model.cpp  (view file @ 544b6739)

@@ -114,9 +114,12 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
         case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
         case LLM_TYPE_A13B: return "A13B";
+        case LLM_TYPE_7B_A1B: return "7B.A1B";
         case LLM_TYPE_8B_A1B: return "8B.A1B";
+        case LLM_TYPE_16B_A1B: return "16B.A1B";
         case LLM_TYPE_21B_A3B: return "21B.A3B";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
+        case LLM_TYPE_100B_A6B: return "100B.A6B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
         case LLM_TYPE_235B_A22B: return "235B.A22B";
         case LLM_TYPE_300B_A47B: return "300B.A47B";
@@ -401,6 +404,19 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, ...)
    // add the device default buffer type
    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
// add the device extra buffer type (if any)
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
if (ggml_backend_dev_get_extra_bufts_fn) {
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
while (extra_bufts && *extra_bufts) {
buft_list.emplace_back(dev, *extra_bufts);
++extra_bufts;
}
}
    return buft_list;
}
@@ -421,11 +437,8 @@ struct llama_model::impl {
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

-   // contexts where the model tensors metadata is stored
-   std::vector<ggml_context_ptr> ctxs;
-
-   // the model memory buffers for the tensor data
-   std::vector<ggml_backend_buffer_ptr> bufs;
+   // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
+   std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;

    buft_list_t cpu_buft_list;
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
@@ -478,7 +491,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_GENERAL_NAME, name, false);

    // everything past this point is not vocab-related
-   if (hparams.vocab_only) {
+   // for CLIP models, we only need to load tensors, no hparams
+   if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
        return;
    }
@@ -487,6 +501,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+   ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false);
+   ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false);

    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -502,8 +518,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
    if (hparams.n_expert > 0) {
        GGML_ASSERT(hparams.n_expert_used > 0);
+       GGML_ASSERT(hparams.n_expert_groups < hparams.n_expert);
+       if (hparams.n_expert_groups > 1) {
+           GGML_ASSERT(hparams.n_expert % hparams.n_expert_groups == 0);
+           GGML_ASSERT(hparams.n_group_used > 0);
+           GGML_ASSERT(hparams.n_group_used < hparams.n_expert_groups);
+       }
    } else {
        GGML_ASSERT(hparams.n_expert_used == 0);
+       GGML_ASSERT(hparams.n_expert_groups == 0);
    }

    std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
@@ -1845,8 +1868,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-               switch (hparams.n_layer) {
-                   // TODO: Add llm type label (not sure this is useful)
+               switch (hparams.n_embd) {
+                   case 1536: type = LLM_TYPE_7B_A1B; break;
+                   case 2048: case 2560: type = LLM_TYPE_3B; break;
+                   case 4096: type = LLM_TYPE_32B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
@@ -1902,6 +1927,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
case LLM_ARCH_BAILINGMOE2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
// TODO: when MTP is implemented, this should probably be updated if needed
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
switch (hparams.n_layer) {
case 20: type = LLM_TYPE_16B_A1B; break;
case 21: type = LLM_TYPE_16B_A1B; break;
case 32: type = LLM_TYPE_100B_A6B; break;
case 33: type = LLM_TYPE_100B_A6B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
        case LLM_ARCH_DOTS1:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -2196,7 +2244,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    max_n_tensors += n_layer*2; // duplicated rope freq tensors
    const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;

-   std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+   // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+   struct ggml_backend_buft_comparator {
+       bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+           return ggml_backend_buft_name(lhs) < ggml_backend_buft_name(rhs);
+       }
+   };
+   std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
@@ -2211,12 +2266,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            throw std::runtime_error(format("failed to create ggml context"));
        }

-       ctx_map[buft] = ctx;
-       pimpl->ctxs.emplace_back(ctx);
+       ctx_map.emplace(buft, ctx);

        return ctx;
    }
-   return it->second;
+   return it->second.get();
    };

    const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
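The comparator above keys the buft -> ctx map on the buffer-type name rather than on the handle's pointer value, so iteration (and therefore buffer creation and logging) happens in a stable order across runs. A standalone sketch of the idea with a stand-in type (buft_stub is hypothetical, not a llama.cpp type):

    #include <cstdio>
    #include <map>
    #include <string>

    // Stand-in for a backend buffer type handle; in llama.cpp this is
    // ggml_backend_buffer_type_t and the name comes from ggml_backend_buft_name().
    struct buft_stub { std::string name; };

    struct buft_name_less {
        bool operator()(const buft_stub * lhs, const buft_stub * rhs) const {
            return lhs->name < rhs->name; // order by name, not by pointer address
        }
    };

    int main() {
        buft_stub cuda{"CUDA0"}, cpu{"CPU"}, host{"CUDA_Host"};

        std::map<const buft_stub *, int, buft_name_less> ctx_map;
        ctx_map[&cuda] = 1;
        ctx_map[&cpu]  = 2;
        ctx_map[&host] = 3;

        // Iteration order is CPU, CUDA0, CUDA_Host on every run, regardless of
        // where the objects happen to be allocated.
        for (const auto & [buft, idx] : ctx_map) {
            std::printf("%s -> %d\n", buft->name.c_str(), idx);
        }
        return 0;
    }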
@@ -5534,6 +5588,70 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                }
            } break;
case LLM_ARCH_BAILINGMOE2:
{
const int64_t n_ff_exp = hparams.n_ff_exp;
const int64_t n_expert_shared = hparams.n_expert_shared;
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
for (int i = 0; i < n_layer; ++i) {
int flags = 0;
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
// skip all tensors in the NextN layers
flags |= TENSOR_SKIP;
}
auto & layer = layers[i];
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
} else { // Dense layers
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
}
// NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags);
layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags);
}
}
} break;
            case LLM_ARCH_DOTS1:
                {
                    const int64_t n_ff_exp = hparams.n_ff_exp;
@@ -6079,16 +6197,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    pimpl->mappings.reserve(ml.mappings.size());

    // create the backend buffers
-   std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
-   ctx_bufs.reserve(ctx_map.size());
+   std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
+   ctx_buf_maps.reserve(ctx_map.size());

    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
    const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
-   pimpl->bufs.reserve(n_max_backend_buffer);
+   pimpl->ctxs_bufs.reserve(n_max_backend_buffer);

-   for (auto & it : ctx_map) {
-       ggml_backend_buffer_type_t buft = it.first;
-       ggml_context * ctx = it.second;
+   for (auto & [buft, ctx_ptr] : ctx_map) {
+       ggml_context * ctx = ctx_ptr.get();

        // skip contexts without tensors
        if (ggml_get_first_tensor(ctx) == nullptr) {
@@ -6112,6 +6229,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
        bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);

+       ggml_backend_buffer_t buf = nullptr;
        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6124,20 +6242,18 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    continue;
                }
                const size_t max_size = ggml_get_max_tensor_size(ctx);
-               ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+               buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                if (buf == nullptr) {
                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                }
-               pimpl->bufs.emplace_back(buf);
                buf_map.emplace(idx, buf);
            }
        }
        else {
-           ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+           buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
            if (buf == nullptr) {
                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
            }
-           pimpl->bufs.emplace_back(buf);
            if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                pimpl->mlock_bufs.emplace_back(new llama_mlock);
                auto & mlock_buf = pimpl->mlock_bufs.back();
@@ -6148,10 +6264,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                buf_map.emplace(idx, buf);
            }
        }
-
-       if (pimpl->bufs.empty()) {
-           throw std::runtime_error("failed to allocate buffer");
-       }
+       pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);

        for (auto & buf : buf_map) {
            // indicate that this buffer contains weights
@@ -6159,7 +6272,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        }

-       ctx_bufs.emplace_back(ctx, buf_map);
+       ctx_buf_maps.emplace_back(ctx, buf_map);
    }

    if (llama_supports_gpu_offload()) {
@@ -6177,22 +6290,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    }

    // print memory requirements per buffer type
-   for (auto & buf : pimpl->bufs) {
+   for (auto & [_, buf] : pimpl->ctxs_bufs) {
        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
    }

    // populate tensors_by_name
-   for (auto & ctx : pimpl->ctxs) {
+   for (auto & [ctx, _] : pimpl->ctxs_bufs) {
        for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
            tensors_by_name.emplace_back(ggml_get_name(cur), cur);
        }
    }

    // load tensor data
-   for (auto & it : ctx_bufs) {
-       ggml_context * ctx = it.first;
-       auto & bufs = it.second;
-       if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+   for (auto & [ctx, buf_map] : ctx_buf_maps) {
+       if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
            return false;
        }
    }
@@ -6232,8 +6343,8 @@ size_t llama_model::n_devices() const {
 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, size_t> ret;
-   for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
-       ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+   for (const auto & [_, buf] : pimpl->ctxs_bufs) {
+       ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
    }
    return ret;
 }
@@ -6396,6 +6507,19 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
    }
if (arch == LLM_ARCH_BAILINGMOE2) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
}
    if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
@@ -11401,8 +11525,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
    }
 };

-struct llm_build_gemma_embedding_iswa : public llm_graph_context {
-    llm_build_gemma_embedding_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+struct llm_build_gemma_embedding : public llm_graph_context {
+    llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_k;

        ggml_tensor * cur;
@@ -11419,8 +11543,7 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context {
        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

-       // TODO: support cacheless iSWA embeddings [TAG_NO_CACHE_ISWA]
-       auto * inp_attn = build_attn_inp_kv_iswa();
+       auto * inp_attn = build_attn_inp_no_cache();

        ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -17245,6 +17368,150 @@ struct llm_build_bailingmoe : public llm_graph_context {
    }
};

The following struct is added (shown without diff markers; the listing is truncated at the end of this section):
struct llm_build_bailingmoe2 : public llm_graph_context {
llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
for (int il = 0; il < n_transformer_layers; ++il) {
ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self_attention
{
cur = build_lora_mm(model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "Kcur_normed", il);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_transformer_layers - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * sa_out = ggml_add(ctx0, cur, inpSA);
cb(sa_out, "sa_out", il);
// MoE branch
cur = build_norm(sa_out,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
ggml_tensor * moe_out =
build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
true, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
{
ggml_tensor * ffn_shexp = build_ffn(cur,
model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(ffn_shexp, "ffn_shexp", il);
cur = ggml_add(ctx0, moe_out, ffn_shexp);
cb(cur, "ffn_out", il);
}
}
cur = ggml_add(ctx0, cur, sa_out);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
};
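Aside on the BailingMoE-2 builder above: layers below hparams.n_layer_dense_lead run a plain dense FFN, while every later layer adds the routed-expert output to an always-active shared expert. The sketch below restates that per-layer branch in isolation; the Tensor alias and the three FFN callbacks are hypothetical stand-ins, not llama.cpp API.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Minimal sketch of the per-layer FFN choice in a BailingMoE-2-style stack.
// "Tensor" and the FFN callbacks are illustrative placeholders only.
using Tensor = std::vector<float>;
using FfnFn  = Tensor (*)(const Tensor &);

static Tensor add(const Tensor & a, const Tensor & b) {
    Tensor out(a.size());
    for (size_t i = 0; i < a.size(); ++i) {
        out[i] = a[i] + b[i];
    }
    return out;
}

// il           : current layer index
// n_dense_lead : hparams.n_layer_dense_lead in the real model
Tensor ffn_for_layer(const Tensor & x, uint32_t il, uint32_t n_dense_lead,
                     FfnFn dense_ffn, FfnFn moe_ffn, FfnFn shared_expert_ffn) {
    if (il < n_dense_lead) {
        // leading layers: ordinary gated FFN, no expert routing
        return dense_ffn(x);
    }
    // remaining layers: routed expert mixture plus the shared expert, summed
    return add(moe_ffn(x), shared_expert_ffn(x));
}
```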
struct llm_build_dots1 : public llm_graph_context {
struct llm_build_dots1 : public llm_graph_context {
llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v;
...
@@ -17900,6 +18167,8 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
...
@@ -17900,6 +18167,8 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
cb(cur, "result_output", -1);
...
@@ -19580,7 +19849,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
...
@@ -19580,7 +19849,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_WAVTOKENIZER_DEC:
case LLM_ARCH_WAVTOKENIZER_DEC:
//case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
case LLM_ARCH_GEMMA_EMBEDDING:
case LLM_ARCH_DREAM:
case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA:
case LLM_ARCH_LLADA:
case LLM_ARCH_LLADA_MOE:
case LLM_ARCH_LLADA_MOE:
...
@@ -19873,7 +20142,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
...
@@ -19873,7 +20142,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
} break;
} break;
case LLM_ARCH_GEMMA_EMBEDDING:
case LLM_ARCH_GEMMA_EMBEDDING:
{
{
llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
} break;
} break;
case LLM_ARCH_STARCODER2:
case LLM_ARCH_STARCODER2:
{
{
...
@@ -20045,6 +20314,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
...
@@ -20045,6 +20314,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
{
llm = std::make_unique<llm_build_bailingmoe>(*this, params);
llm = std::make_unique<llm_build_bailingmoe>(*this, params);
} break;
} break;
case LLM_ARCH_BAILINGMOE2:
{
llm = std::make_unique<llm_build_bailingmoe2>(*this, params);
} break;
case LLM_ARCH_SEED_OSS:
case LLM_ARCH_SEED_OSS:
{
{
llm = std::make_unique<llm_build_seed_oss>(*this, params);
llm = std::make_unique<llm_build_seed_oss>(*this, params);
...
@@ -20220,6 +20493,7 @@ int32_t llama_n_head(const llama_model * model) {
...
@@ -20220,6 +20493,7 @@ int32_t llama_n_head(const llama_model * model) {
llama_rope_type llama_model_rope_type(const llama_model * model) {
llama_rope_type llama_model_rope_type(const llama_model * model) {
switch (model->arch) {
switch (model->arch) {
// these models do not use RoPE
// these models do not use RoPE
case LLM_ARCH_CLIP:
case LLM_ARCH_GPT2:
case LLM_ARCH_GPT2:
case LLM_ARCH_GPTJ:
case LLM_ARCH_GPTJ:
case LLM_ARCH_MPT:
case LLM_ARCH_MPT:
...
@@ -20311,6 +20585,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
...
@@ -20311,6 +20585,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_EXAONE:
case LLM_ARCH_EXAONE:
case LLM_ARCH_EXAONE4:
case LLM_ARCH_EXAONE4:
case LLM_ARCH_MINICPM3:
case LLM_ARCH_MINICPM3:
case LLM_ARCH_BAILINGMOE2:
case LLM_ARCH_DOTS1:
case LLM_ARCH_DOTS1:
case LLM_ARCH_HUNYUAN_MOE:
case LLM_ARCH_HUNYUAN_MOE:
case LLM_ARCH_OPENAI_MOE:
case LLM_ARCH_OPENAI_MOE:
...
...
llama/llama.cpp/src/llama-model.h
View file @ 544b6739
...
@@ -108,9 +108,12 @@ enum llm_type {
...
@@ -108,9 +108,12 @@ enum llm_type {
LLM_TYPE_17B_16E, // llama4 Scout
LLM_TYPE_17B_16E, // llama4 Scout
LLM_TYPE_17B_128E, // llama4 Maverick
LLM_TYPE_17B_128E, // llama4 Maverick
LLM_TYPE_A13B,
LLM_TYPE_A13B,
LLM_TYPE_7B_A1B,
LLM_TYPE_8B_A1B, // lfm2moe
LLM_TYPE_8B_A1B, // lfm2moe
LLM_TYPE_16B_A1B,
LLM_TYPE_21B_A3B, // Ernie MoE small
LLM_TYPE_21B_A3B, // Ernie MoE small
LLM_TYPE_30B_A3B,
LLM_TYPE_30B_A3B,
LLM_TYPE_100B_A6B,
LLM_TYPE_106B_A12B, // GLM-4.5-Air
LLM_TYPE_106B_A12B, // GLM-4.5-Air
LLM_TYPE_235B_A22B,
LLM_TYPE_235B_A22B,
LLM_TYPE_300B_A47B, // Ernie MoE big
LLM_TYPE_300B_A47B, // Ernie MoE big
...
...
llama/llama.cpp/src/llama-quant.cpp
View file @ 544b6739
...
@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
...
@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
});
});
}
}
bool is_clip_model = false;
for (const auto * it : tensors) {
for (const auto * it : tensors) {
const struct ggml_tensor * tensor = it->tensor;
const struct ggml_tensor * tensor = it->tensor;
...
@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
...
@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
qs.has_output = true;
qs.has_output = true;
}
}
is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
}
}
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
// sanity checks for models that have attention layers
// sanity checks for models that have attention layers
if (qs.n_attention_wv != 0)
if (qs.n_attention_wv != 0 && !is_clip_model)
{
{
const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
// attention layers have a non-zero number of kv heads
// attention layers have a non-zero number of kv heads
...
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
...
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// do not quantize relative position bias (T5)
// do not quantize relative position bias (T5)
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
// do not quantize specific multimodal tensors
quantize &= name.find(".position_embd.") == std::string::npos;
ggml_type new_type;
ggml_type new_type;
void * new_data;
void * new_data;
size_t new_size;
size_t new_size;
...
...
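The llama-quant.cpp change above flags a standalone CLIP/mmproj GGUF by looking for tensor names with the "mm." prefix, and then skips the attention-head sanity check that only applies to full LLMs. A minimal sketch of that detection, with looks_like_clip_model() as a hypothetical helper name rather than llama.cpp API:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Sketch only: mirrors the is_clip_model check added to the quantizer.
static bool looks_like_clip_model(const std::vector<std::string> & tensor_names) {
    bool is_clip = false;
    for (const auto & name : tensor_names) {
        is_clip |= name.rfind("mm.", 0) == 0; // true if the name starts with "mm."
    }
    return is_clip;
}

int main() {
    const std::vector<std::string> names = {
        "mm.input_projection.weight", // multimodal projector tensor
        "v.blk.0.attn_q.weight",      // vision encoder tensor
    };
    // prints "true"; with this flag set, the LLM-only attention sanity checks are skipped
    std::cout << std::boolalpha << looks_like_clip_model(names) << "\n";
    return 0;
}
```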
llama/llama.cpp/src/llama-vocab.cpp
View file @ 544b6739
...
@@ -1957,6 +1957,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
...
@@ -1957,6 +1957,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
clean_spaces = false;
clean_spaces = false;
} else if (
} else if (
tokenizer_pre == "bailingmoe"  ||
tokenizer_pre == "bailingmoe"  ||
tokenizer_pre == "bailingmoe2" ||
tokenizer_pre == "llada-moe") {
tokenizer_pre == "llada-moe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
clean_spaces = false;
clean_spaces = false;
...
...
llama/llama.cpp/src/llama.cpp
View file @ 544b6739
...
@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
...
@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
} catch(const std::exception & e) {
} catch(const std::exception & e) {
throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
}
}
if (model.arch == LLM_ARCH_CLIP) {
throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
}
try {
try {
model.load_vocab(ml);
model.load_vocab(ml);
} catch(const std::exception & e) {
} catch(const std::exception & e) {
...
@@ -314,6 +317,7 @@ struct llama_model * llama_model_load_from_splits(
...
@@ -314,6 +317,7 @@ struct llama_model * llama_model_load_from_splits(
LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
return nullptr;
return nullptr;
}
}
splits.reserve(n_paths);
for (size_t i = 0; i < n_paths; ++i) {
for (size_t i = 0; i < n_paths; ++i) {
splits.push_back(paths[i]);
splits.push_back(paths[i]);
}
}
...
...
llama/llama.cpp/tools/mtmd/clip-impl.h
View file @ 544b6739
...
@@ -30,6 +30,7 @@
...
@@ -30,6 +30,7 @@
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
// vision-specific
// vision-specific
#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
...
@@ -48,6 +49,7 @@
...
@@ -48,6 +49,7 @@
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
// audio-specific
// audio-specific
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
...
...
llama/llama.cpp/tools/mtmd/clip.cpp
View file @ 544b6739
...
@@ -2234,15 +2234,27 @@ struct clip_model_loader {
...
@@ -2234,15 +2234,27 @@ struct clip_model_loader {
// projector type
// projector type
std::string proj_type;
std::string proj_type;
{
{
// default key
get_string(KEY_PROJ_TYPE, proj_type, false);
get_string(KEY_PROJ_TYPE, proj_type, false);
if (!proj_type.empty()) {
model.proj_type = clip_projector_type_from_string(proj_type);
// for models with mixed modalities
if (proj_type.empty()) {
if (modality == CLIP_MODALITY_VISION) {
get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
} else if (modality == CLIP_MODALITY_AUDIO) {
get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false);
} else {
GGML_ABORT("unknown modality");
}
}
}
model.proj_type = clip_projector_type_from_string(proj_type);
if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
}
}
// correct arch for multimodal models
// correct arch for multimodal models (legacy method)
if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
model.proj_type = modality == CLIP_MODALITY_VISION
model.proj_type = modality == CLIP_MODALITY_VISION
? PROJECTOR_TYPE_QWEN25VL
? PROJECTOR_TYPE_QWEN25VL
...
...
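The clip.cpp hunk above changes how the projector type is resolved for GGUFs that carry mixed modalities: the generic clip.projector_type key is read first, and only if it is empty does the loader fall back to the per-modality keys added in clip-impl.h. A standalone sketch of that lookup order, using a plain map in place of the real gguf reader; resolve_proj_type() and the Modality enum are illustrative, not the clip_model_loader API:

```cpp
#include <map>
#include <stdexcept>
#include <string>

// Sketch only: the projector-type fallback order for mixed-modality models.
enum class Modality { Vision, Audio };

std::string resolve_proj_type(const std::map<std::string, std::string> & kv, Modality modality) {
    auto get = [&](const std::string & key) {
        auto it = kv.find(key);
        return it == kv.end() ? std::string() : it->second;
    };

    // default key first (KEY_PROJ_TYPE)
    std::string proj = get("clip.projector_type");

    // for models with mixed modalities, fall back to the per-modality key
    if (proj.empty()) {
        proj = (modality == Modality::Vision)
            ? get("clip.vision.projector_type")  // KEY_VISION_PROJ_TYPE
            : get("clip.audio.projector_type");  // KEY_AUDIO_PROJ_TYPE
    }

    if (proj.empty()) {
        throw std::runtime_error("unknown projector type");
    }
    return proj;
}
```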
llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
View file @ 544b6739
...
@@ -23,7 +23,7 @@ problem.
...
@@ -23,7 +23,7 @@ problem.
8 files changed, 21 insertions(+), 2 deletions(-)
8 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index ff9135fe..8ba86f82 100644
index ff9135fe2..8ba86f824 100644
--- a/ggml/src/ggml-backend.cpp
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
...
@@ -64,18 +64,18 @@ index ff9135fe..8ba86f82 100644
...
@@ -64,18 +64,18 @@ index ff9135fe..8ba86f82 100644
/* .init_tensor = */ NULL, // no initialization required
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index ad1adba6..7d44f74f 100755
index 8bd5449f1..01e2df61a 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -843,6 +843,7 @@ static void ggml_backend_cann_buffer_free_buffer(
@@ -820,6 +820,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
ggml_backend_cann_buffer_context* ctx =
static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) {
(ggml_backend_cann_buffer_context*)buffer->context;
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
delete ctx;
delete ctx;
+ delete buffer;
+ delete buffer;
}
}
/**
/**
@@ -1630,6 +1631,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
@@ -1560,6 +1561,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
*/
*/
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context));
ACL_CHECK(aclrtFreeHost(buffer->context));
...
@@ -84,10 +84,10 @@ index ad1adba6..7d44f74f 100755
...
@@ -84,10 +84,10 @@ index ad1adba6..7d44f74f 100755
/**
/**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 856e9de2..c0b1e4c1 100644
index bc396b521..aefc6935e 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -567,6 +567,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -576,6 +576,7 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx;
delete ctx;
...
@@ -95,7 +95,7 @@ index 856e9de2..c0b1e4c1 100644
...
@@ -95,7 +95,7 @@ index 856e9de2..c0b1e4c1 100644
}
}
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
@@ -822,6 +823,7 @@ struct ggml_backend_cuda_split_buffer_context {
@@ -831,6 +832,7 @@ struct ggml_backend_cuda_split_buffer_context {
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
delete ctx;
delete ctx;
...
@@ -103,7 +103,7 @@ index 856e9de2..c0b1e4c1 100644
...
@@ -103,7 +103,7 @@ index 856e9de2..c0b1e4c1 100644
}
}
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1103,6 +1105,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
@@ -1112,6 +1114,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
CUDA_CHECK(cudaFreeHost(buffer->context));
CUDA_CHECK(cudaFreeHost(buffer->context));
...
@@ -112,7 +112,7 @@ index 856e9de2..c0b1e4c1 100644
...
@@ -112,7 +112,7 @@ index 856e9de2..c0b1e4c1 100644
static void * ggml_cuda_host_malloc(size_t size) {
static void * ggml_cuda_host_malloc(size_t size) {
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 7afc881f..bf096227 100644
index 7afc881fa..bf0962274 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
@@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
...
@@ -132,10 +132,10 @@ index 7afc881f..bf096227 100644
...
@@ -132,10 +132,10 @@ index 7afc881f..bf096227 100644
static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 79d21487..38c75018 100644
index db33a4ab6..c42ee26e1 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -3212,6 +3212,7 @@ struct ggml_backend_opencl_buffer_context {
@@ -3266,6 +3266,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx;
delete ctx;
...
@@ -144,7 +144,7 @@ index 79d21487..38c75018 100644
...
@@ -144,7 +144,7 @@ index 79d21487..38c75018 100644
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index aad48d62..a46c0f52 100644
index a38df5a97..fd07e4a21 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -528,6 +528,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -528,6 +528,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
...
@@ -156,10 +156,10 @@ index aad48d62..a46c0f52 100644
...
@@ -156,10 +156,10 @@ index aad48d62..a46c0f52 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 45b8c216..4ec9a592 100644
index b695ba051..37e853120 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -334,6 +334,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -352,6 +352,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
ggml_sycl_set_device(ctx->device);
ggml_sycl_set_device(ctx->device);
delete ctx;
delete ctx;
...
@@ -167,7 +167,7 @@ index 45b8c216..4ec9a592 100644
...
@@ -167,7 +167,7 @@ index 45b8c216..4ec9a592 100644
}
}
catch (sycl::exception const &exc) {
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -795,6 +796,7 @@ struct ggml_backend_sycl_split_buffer_context {
@@ -813,6 +814,7 @@ struct ggml_backend_sycl_split_buffer_context {
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
delete ctx;
delete ctx;
...
@@ -175,7 +175,7 @@ index 45b8c216..4ec9a592 100644
...
@@ -175,7 +175,7 @@ index 45b8c216..4ec9a592 100644
}
}
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1137,6 +1139,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
@@ -1155,6 +1157,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_sycl_host_free(buffer->context);
ggml_sycl_host_free(buffer->context);
...
@@ -184,10 +184,10 @@ index 45b8c216..4ec9a592 100644
...
@@ -184,10 +184,10 @@ index 45b8c216..4ec9a592 100644
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 3cd89c71..ed83236f 100644
index b783f7805..216dc167c 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -11600,6 +11600,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -11828,6 +11828,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer);
ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx;
delete ctx;
...
@@ -195,7 +195,7 @@ index 3cd89c71..ed83236f 100644
...
@@ -195,7 +195,7 @@ index 3cd89c71..ed83236f 100644
}
}
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -11743,6 +11744,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
@@ -11971,6 +11972,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
...
...
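For context on the vendored patch above: after the rebase it still applies the same pattern in every backend, namely that the buffer's free callback deletes the backend-private context and, with the patch, also deletes the buffer object itself, so allocation and deallocation happen in the same compilation unit. A simplified sketch of that pattern; the struct names are stand-ins, not the real ggml types:

```cpp
// Sketch only: the allocate-and-free-in-one-place pattern the patch enforces.
struct example_buffer_context {
    // backend-private state (device pointers, sizes, ...)
};

struct example_buffer {
    example_buffer_context * context;
};

static example_buffer * example_buffer_alloc() {
    return new example_buffer{ new example_buffer_context{} };
}

static void example_buffer_free(example_buffer * buffer) {
    delete buffer->context; // matches the existing "delete ctx;" lines
    delete buffer;          // the line the patch adds in each backend
}

int main() {
    example_buffer * buf = example_buffer_alloc();
    example_buffer_free(buf); // both objects are released where they were created
    return 0;
}
```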