OpenDAS / ollama · Commits · e9e5f61c

Commit e9e5f61c (unverified) authored Apr 25, 2025 by Jeffrey Morgan, committed by GitHub Apr 24, 2025

llama: update to commit 2016f07b (#10352)

Parent: 11dde418
Changes: 46
Showing 20 changed files with 798 additions and 720 deletions (+798 −720)
Makefile.sync                                                                    +1    −1
llama/build-info.cpp                                                             +1    −1
llama/llama.cpp/examples/llava/clip-impl.h                                       +0    −3
llama/llama.cpp/examples/llava/clip.cpp                                          +456  −450
llama/llama.cpp/src/llama-arch.cpp                                               +6    −17
llama/llama.cpp/src/llama-arch.h                                                 +4    −0
llama/llama.cpp/src/llama-context.cpp                                            +5    −1
llama/llama.cpp/src/llama-graph.cpp                                              +20   −9
llama/llama.cpp/src/llama-graph.h                                                +7    −3
llama/llama.cpp/src/llama-hparams.h                                              +4    −0
llama/llama.cpp/src/llama-kv-cache.cpp                                           +1    −1
llama/llama.cpp/src/llama-model.cpp                                              +190  −138
llama/llama.cpp/src/llama-model.h                                                +2    −0
llama/llama.cpp/src/llama-vocab.cpp                                              +5    −0
llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch    +17   −17
llama/patches/0002-pretokenizer.patch                                            +1    −1
llama/patches/0003-embeddings.patch                                              +4    −4
llama/patches/0004-clip-unicode.patch                                            +5    −5
llama/patches/0005-solar-pro.patch                                               +23   −23
llama/patches/0006-add-mllama-support.patch                                      +46   −46
Makefile.sync

 UPSTREAM=https://github.com/ggerganov/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=71e90e8813f90097701e62f7fce137d96ddf41e2
+FETCH_HEAD=2016f07bd106c73699ecbaace80f55db5ed95dac

 .PHONY: help
 help:
...
llama/build-info.cpp

 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "71e90e8813f90097701e62f7fce137d96ddf41e2";
+char const *LLAMA_COMMIT = "2016f07bd106c73699ecbaace80f55db5ed95dac";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
llama/llama.cpp/examples/llava/clip-impl.h

@@ -50,7 +50,6 @@
 // tensor name constants
 //
-#define TN_TOKEN_EMBD      "%s.token_embd.weight"
 #define TN_POS_EMBD        "%s.position_embd.weight"
 #define TN_CLASS_EMBD      "v.class_embd"
 #define TN_PATCH_EMBD      "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
@@ -66,8 +65,6 @@
 #define TN_LN_2            "%s.blk.%d.ln2.%s"
 #define TN_LN_PRE          "%s.pre_ln.%s"
 #define TN_LN_POST         "%s.post_ln.%s"
-#define TN_TEXT_PROJ       "text_projection.weight"
-#define TN_VIS_PROJ        "visual_projection.weight"
 #define TN_LLAVA_PROJ      "mm.%d.%s"
 #define TN_MVLM_PROJ_MLP   "mm.model.mlp.%d.%s"
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
...
llama/llama.cpp/examples/llava/clip.cpp  (+456 −450)

(diff collapsed in this view)
llama/llama.cpp/src/llama-arch.cpp

@@ -145,6 +145,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
     { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,  "%s.attention.block_skip_connection"  },
     { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
+    { LLM_KV_ATTENTION_KEY_LENGTH_MLA,         "%s.attention.key_length_mla"         },
+    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,       "%s.attention.value_length_mla"       },
     { LLM_KV_ROPE_DIMENSION_COUNT,             "%s.rope.dimension_count"             },
     { LLM_KV_ROPE_DIMENSION_SECTIONS,          "%s.rope.dimension_sections"          },
...
@@ -1142,6 +1144,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     { LLM_TENSOR_ATTN_Q_B,      "blk.%d.attn_q_b"      },
     { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
     { LLM_TENSOR_ATTN_KV_B,     "blk.%d.attn_kv_b"     },
+    { LLM_TENSOR_ATTN_K_B,      "blk.%d.attn_k_b"      },
+    { LLM_TENSOR_ATTN_V_B,      "blk.%d.attn_v_b"      },
     { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output"   },
     { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm"      },
     { LLM_TENSOR_FFN_GATE,      "blk.%d.ffn_gate"      },
...
@@ -1636,23 +1640,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_Q_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_A_MQA,  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_B,      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_ATTN_Q,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_ATTN_K,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_K,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_V,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_QKV,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_OUT,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP_SHEXP,   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_A,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_A_MQA,  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_B,      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_K_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_V_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_Q,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
...
llama/llama.cpp/src/llama-arch.h

@@ -149,6 +149,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
     LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
+    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
+    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
...
@@ -311,6 +313,8 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_B,
     LLM_TENSOR_ATTN_KV_A_MQA,
     LLM_TENSOR_ATTN_KV_B,
+    LLM_TENSOR_ATTN_K_B,
+    LLM_TENSOR_ATTN_V_B,
     LLM_TENSOR_ATTN_Q_A_NORM,
     LLM_TENSOR_ATTN_KV_A_NORM,
     LLM_TENSOR_ATTN_SUB_NORM,
...
llama/llama.cpp/src/llama-context.cpp

@@ -10,6 +10,7 @@
 #include <cstring>
 #include <stdexcept>
 #include <cinttypes>
+#include <cmath>

 //
 // llama_context
...
@@ -473,7 +474,6 @@ ggml_tensor * llama_context::build_rope_shift(
     const auto & n_ctx_orig       = cparams.n_ctx_orig_yarn;
     const auto & yarn_ext_factor  = cparams.yarn_ext_factor;
-    const auto & yarn_attn_factor = cparams.yarn_attn_factor;
     const auto & yarn_beta_fast   = cparams.yarn_beta_fast;
     const auto & yarn_beta_slow   = cparams.yarn_beta_slow;
...
@@ -482,6 +482,10 @@ ggml_tensor * llama_context::build_rope_shift(
     const auto & n_rot     = hparams.n_rot;
     const auto & rope_type = hparams.rope_type;

+    // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
+    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
+    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
+
     ggml_tensor * tmp;

     if (ggml_is_quantized(cur->type)) {
...
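The hunk above applies the same YaRN attention-factor scaling during KV-cache RoPE shifts that llm_build_deepseek2() already uses. A minimal standalone sketch of the arithmetic, assuming an illustrative freq_scale value that is not taken from this diff:

#include <cmath>
#include <cstdio>

int main() {
    // attn_factor = 1 / (1 + 0.1 * ln(1 / freq_scale)), as in the hunk above.
    const float freq_scale = 0.25f; // assumed example: 4x RoPE context extension
    const float yarn_attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
    printf("yarn_attn_factor = %.3f\n", yarn_attn_factor); // prints ~0.878 for this freq_scale
    return 0;
}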
llama/llama.cpp/src/llama-graph.cpp

@@ -1194,6 +1194,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         ggml_tensor * v,
         ggml_tensor * kq_b,
         ggml_tensor * kq_mask,
+        ggml_tensor * v_mla,
             bool      v_trans,
             float     kq_scale) const {
     //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
...
@@ -1205,8 +1206,6 @@ ggml_tensor * llm_graph_context::build_attn_mha(
     //const auto & n_embd_head_k = hparams.n_embd_head_k;
     //const auto & n_embd_head_v = hparams.n_embd_head_v;

-    const auto n_embd_head_v = v_trans ? v->ne[1] : v->ne[0];
-
     const auto n_tokens = q->ne[1];
     const auto n_head   = q->ne[2];
     const auto n_kv     = k->ne[1];
...
@@ -1235,7 +1234,12 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);

-        cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens);
+        if (v_mla) {
+            cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
+            cur = ggml_mul_mat(ctx0, v_mla, cur);
+        }
+
+        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
     } else {
         ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
...
@@ -1273,9 +1277,14 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);

-        ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+        // for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
+        if (v_mla) {
+            kqv = ggml_mul_mat(ctx0, v_mla, kqv);
+        }
+
+        cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);

-        cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
+        cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);

         if (!cparams.offload_kqv) {
             // all nodes between the KV store and the attention output are run on the CPU
...
@@ -1310,6 +1319,7 @@ ggml_tensor * llm_graph_context::build_attn(
         ggml_tensor * k_cur,
         ggml_tensor * v_cur,
         ggml_tensor * kq_b,
+        ggml_tensor * v_mla,
             float     kq_scale,
             int       il) const {
     GGML_UNUSED(n_tokens);
...
@@ -1331,7 +1341,7 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
     //cb(k, "v", il);

-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale);
+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);

     cb(cur, "kqv_out", il);
...
@@ -1385,6 +1395,7 @@ ggml_tensor * llm_graph_context::build_attn(
         ggml_tensor * k_cur,
         ggml_tensor * v_cur,
         ggml_tensor * kq_b,
+        ggml_tensor * v_mla,
             float     kq_scale,
             int       il) const {
     // these nodes are added to the graph together so that they are not reordered
...
@@ -1470,7 +1481,7 @@ ggml_tensor * llm_graph_context::build_attn(
                 ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v,
                 0);

-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_trans, kq_scale);
+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale);
     cb(cur, "kqv_out", il);

     if (wo) {
...
@@ -1529,6 +1540,7 @@ ggml_tensor * llm_graph_context::build_attn(
         ggml_tensor * k_cur,
         ggml_tensor * v_cur,
         ggml_tensor * kq_b,
+        ggml_tensor * v_mla,
             float     kq_scale,
             int       il) const {
     // these nodes are added to the graph together so that they are not reordered
...
@@ -1548,7 +1560,7 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
     //cb(k, "v", il);

-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale);
+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);

     cb(cur, "kqv_out", il);
...
@@ -1717,4 +1729,3 @@ void llm_graph_context::build_pooling(
     ggml_build_forward_expand(gf, cur);
 }
-
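The new v_mla argument threaded through build_attn_mha() carries the MLA "decompression" matrix: attention runs in the compressed MQA-style latent space, and each head's result is then projected back up to the full value head width. A plain-C++ sketch of that per-head projection, independent of ggml; the dimension names and sizes here are illustrative assumptions, not values taken from this diff:

#include <cstddef>
#include <vector>

// Multiply one head's latent attention output (width n_latent) by that head's
// projection matrix (n_out rows x n_latent columns, row-major) to recover the
// full per-head value width. This is the idea behind ggml_mul_mat(ctx0, v_mla, kqv)
// in the hunk above, written out in scalar form.
static std::vector<float> decompress_head(const std::vector<float> & latent,
                                          const std::vector<float> & proj,
                                          size_t n_latent, size_t n_out) {
    std::vector<float> out(n_out, 0.0f);
    for (size_t r = 0; r < n_out; ++r) {
        for (size_t c = 0; c < n_latent; ++c) {
            out[r] += proj[r * n_latent + c] * latent[c];
        }
    }
    return out;
}

int main() {
    // Assumed example sizes: latent width 512, value head width 128.
    const size_t n_latent = 512, n_out = 128;
    std::vector<float> latent(n_latent, 0.01f);
    std::vector<float> proj(n_out * n_latent, 0.001f);
    std::vector<float> head = decompress_head(latent, proj, n_latent, n_out);
    return head.size() == n_out ? 0 : 1;
}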
llama/llama.cpp/src/llama-graph.h

@@ -522,6 +522,7 @@ struct llm_graph_context {
             ggml_tensor * v,      // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
             ggml_tensor * kq_b,
             ggml_tensor * kq_mask,
+            ggml_tensor * v_mla,  // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                   bool    v_trans,
                   float   kq_scale) const;
...
@@ -536,6 +537,7 @@ struct llm_graph_context {
             ggml_tensor * k_cur,  // [n_embd_head_k, n_head_k, n_tokens]
             ggml_tensor * v_cur,  // [n_embd_head_v, n_head_v, n_tokens]
             ggml_tensor * kq_b,
+            ggml_tensor * v_mla,  // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                   float   kq_scale,
                   int     il) const;
...
@@ -550,6 +552,7 @@ struct llm_graph_context {
             ggml_tensor * k_cur,  // [n_embd_head_k, n_head_k, n_tokens]
             ggml_tensor * v_cur,  // [n_embd_head_v, n_head_v, n_tokens]
             ggml_tensor * kq_b,
+            ggml_tensor * v_mla,  // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                   float   kq_scale,
                   int     il) const;
...
@@ -564,6 +567,7 @@ struct llm_graph_context {
             ggml_tensor * k_cur,  // [n_embd_head_k, n_head_k, n_tokens]
             ggml_tensor * v_cur,  // [n_embd_head_v, n_head_v, n_tokens]
             ggml_tensor * kq_b,
+            ggml_tensor * v_mla,  // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                   float   kq_scale,
                   int     il) const;
...
llama/llama.cpp/src/llama-hparams.h

@@ -46,6 +46,10 @@ struct llama_hparams {
     uint32_t n_rel_attn_bkts = 0;
     uint32_t n_vocab = 0;

+    // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
+    uint32_t n_embd_head_k_mla = 0;
+    uint32_t n_embd_head_v_mla = 0;
+
     // for WavTokenizer
     struct llama_hparams_posnet   posnet;
     struct llama_hparams_convnext convnext;
...
llama/llama.cpp/src/llama-kv-cache.cpp

@@ -27,7 +27,7 @@ bool llama_kv_cache_unified::init(
     recurrent = llama_model_is_recurrent(&model);
     v_trans   = !recurrent && !cparams.flash_attn;
-    can_shift = !recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
+    can_shift = !recurrent;

     LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n",
             __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift);
...
llama/llama.cpp/src/llama-model.cpp  (+190 −138)

(diff collapsed in this view)
llama/llama.cpp/src/llama-model.h

@@ -174,6 +174,8 @@ struct llama_layer {
     struct ggml_tensor * wq_b      = nullptr;
     struct ggml_tensor * wkv_a_mqa = nullptr;
     struct ggml_tensor * wkv_b     = nullptr;
+    struct ggml_tensor * wk_b      = nullptr;
+    struct ggml_tensor * wv_b      = nullptr;
     struct ggml_tensor * wq_cross  = nullptr;
     struct ggml_tensor * wk_cross  = nullptr;
     struct ggml_tensor * wv_cross  = nullptr;
...
llama/llama.cpp/src/llama-vocab.cpp

@@ -1833,6 +1833,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                     || t.first == "<|fim_prefix|>"  // Qwen
                     || t.first == "<fim-prefix>"
+                    || t.first == "<fim_prefix>"    // Granite
                     || t.first == "<|fim▁begin|>" // DeepSeek
                     || t.first == "<PRE>"
                     || t.first == "▁<PRE>"  // CodeLlama
...
@@ -1851,6 +1852,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                     || t.first == "<|fim_suffix|>"  // Qwen
                     || t.first == "<fim-suffix>"
+                    || t.first == "<fim_suffix>"    // Granite
                     || t.first == "<|fim▁hole|>"  // DeepSeek
                     || t.first == "<SUF>"
                     || t.first == "▁<SUF>"  // CodeLlama
...
@@ -1869,6 +1871,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                     || t.first == "<|fim_middle|>"  // Qwen
                     || t.first == "<fim-middle>"
+                    || t.first == "<fim_middle>"    // Granite
                     || t.first == "<|fim▁end|>"   // DeepSeek
                     || t.first == "<MID>"
                     || t.first == "▁<MID>"  // CodeLlama
...
@@ -1887,6 +1890,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                     || t.first == "<|fim_pad|>"  // Qwen
                     || t.first == "<fim-pad>"
+                    || t.first == "<fim_pad>"    // Granite
                     || t.first == "<PAD>"
                     ) {
                 special_fim_pad_id = t.second;
...
@@ -1905,6 +1909,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|repo_name|>"
                     || t.first == "<fim-repo>"
                     || t.first == "<REPO>"
+                    || t.first == "<reponame>"   // Granite
                     ) {
                 special_fim_rep_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
...
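The token spellings matched above are the ones models use for fill-in-the-middle prompting. A minimal sketch of the conventional prefix/suffix/middle layout, using the Qwen-style spellings from the hunk; the PSM ordering is the common convention and an assumption here, not something this diff specifies:

#include <iostream>
#include <string>

int main() {
    // The model is asked to generate the "middle" that fits between prefix and suffix.
    const std::string prefix = "def add(a, b):\n    ";
    const std::string suffix = "\n    return result\n";
    const std::string prompt = "<|fim_prefix|>" + prefix +
                               "<|fim_suffix|>" + suffix +
                               "<|fim_middle|>";
    std::cout << prompt << std::endl;
    return 0;
}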
llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch

@@ -65,10 +65,10 @@ index 273075f4..dd11f304 100644
  /* .init_tensor     = */ NULL, // no initialization required
  /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
 diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
-index cec36b36..4b057973 100644
+index e2617b06..242e50a7 100644
 --- a/ggml/src/ggml-cann/ggml-cann.cpp
 +++ b/ggml/src/ggml-cann/ggml-cann.cpp
-@@ -530,6 +530,7 @@ static void ggml_backend_cann_buffer_free_buffer(
+@@ -800,6 +800,7 @@ static void ggml_backend_cann_buffer_free_buffer(
  ggml_backend_cann_buffer_context* ctx =
      (ggml_backend_cann_buffer_context*)buffer->context;
  delete ctx;
@@ -76,7 +76,7 @@ index cec36b36..4b057973 100644
  }
  /**
-@@ -1199,6 +1200,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
+@@ -1472,6 +1473,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
  */
  static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
  ACL_CHECK(aclrtFreeHost(buffer->context));
@@ -85,10 +85,10 @@ index cec36b36..4b057973 100644
  /**
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index fafe9633..59a49560 100644
+index a7febef7..31750b6f 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -533,6 +533,7 @@ struct ggml_backend_cuda_buffer_context {
+@@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
  static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
  delete ctx;
@@ -96,7 +96,7 @@ index fafe9633..59a49560 100644
  }
  static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
-@@ -788,6 +789,7 @@ struct ggml_backend_cuda_split_buffer_context {
+@@ -789,6 +790,7 @@ struct ggml_backend_cuda_split_buffer_context {
  static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
  delete ctx;
@@ -104,7 +104,7 @@ index fafe9633..59a49560 100644
  }
  static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1061,6 +1063,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
+@@ -1062,6 +1064,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  CUDA_CHECK(cudaFreeHost(buffer->context));
@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
  static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 9f1c6c6c..310afe8a 100644
+index 266d8af4..12886cd3 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -4641,6 +4641,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
  }
  free(ctx);
@@ -137,10 +137,10 @@ index 9f1c6c6c..310afe8a 100644
  static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
-index b8b5cbd3..14d4561b 100644
+index 05a2f4e6..392cc18d 100644
 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp
 +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
-@@ -1443,6 +1443,7 @@ struct ggml_backend_opencl_buffer_context {
+@@ -1940,6 +1940,7 @@ struct ggml_backend_opencl_buffer_context {
  static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
  delete ctx;
@@ -149,10 +149,10 @@ index b8b5cbd3..14d4561b 100644
  static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index 862b9b66..34536681 100644
+index a0667b7d..bd83adc5 100644
 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
 +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
-@@ -443,6 +443,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
  GGML_ASSERT(status);
  delete ctx;
@@ -161,7 +161,7 @@ index 862b9b66..34536681 100644
  static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 3e48a924..a3d182fc 100644
+index 1de34c96..4600f61e 100644
 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
 +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
 @@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -189,10 +189,10 @@ index 3e48a924..a3d182fc 100644
  static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 783a0ff8..8ac1e07e 100644
+index 39f3cd34..c569a8a5 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -8639,6 +8639,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
  ggml_vk_destroy_buffer(ctx->dev_buffer);
  delete ctx;
@@ -200,7 +200,7 @@ index 783a0ff8..8ac1e07e 100644
  }
  static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -8782,6 +8783,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
  static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
  ggml_vk_host_free(vk_instance.devices[0], buffer->context);
...
llama/patches/0002-pretokenizer.patch

@@ -10,7 +10,7 @@ logs instead of throwing an error
  1 file changed, 3 insertions(+), 11 deletions(-)
 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 464ff01e..0125ee53 100644
+index 48060517..a35b498c 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
 @@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
...
llama/patches/0003-embeddings.patch

@@ -11,10 +11,10 @@ instead of forcing one or the error
  1 file changed, 3 insertions(+), 3 deletions(-)
 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 4735e98e..65135172 100644
+index 983385f8..32f59819 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -1232,7 +1232,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) {
  int64_t n_outputs_all = 0;
  // count outputs
@@ -23,7 +23,7 @@ index 4735e98e..65135172 100644
  for (uint32_t i = 0; i < n_tokens_all; ++i) {
      n_outputs_all += batch.logits[i] != 0;
  }
-@@ -1344,7 +1344,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) {
  // ggml_graph_dump_dot(gf, NULL, "llama.dot");
  //}
@@ -32,7 +32,7 @@ index 4735e98e..65135172 100644
  auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
  if (t_embd && res->get_embd_pooled()) {
-@@ -1488,7 +1488,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
+@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
  const auto n_embd = hparams.n_embd;
  // TODO: use a per-batch flag for logits presence instead
...
llama/patches/0004-clip-unicode.patch

@@ -10,12 +10,12 @@ filesystems for paths that include wide characters
  1 file changed, 39 insertions(+)
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 49c90b75..4b72ea9f 100644
+index 75970615..d57b4bd6 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
-@@ -28,6 +28,19 @@
+@@ -29,6 +29,19 @@
- #include <cinttypes>
  #include <limits>
+ #include <array>
  +#if defined(_WIN32)
  +#define WIN32_LEAN_AND_MEAN
@@ -33,7 +33,7 @@ index 49c90b75..4b72ea9f 100644
  struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
  //#define CLIP_DEBUG_FUNCTIONS
-@@ -1429,7 +1442,29 @@ struct clip_model_loader {
+@@ -1430,7 +1443,29 @@ struct clip_model_loader {
  {
      std::vector<uint8_t> read_buf;
@@ -63,7 +63,7 @@ index 49c90b75..4b72ea9f 100644
  if (!fin) {
      throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
  }
-@@ -1456,7 +1491,11 @@ struct clip_model_loader {
+@@ -1457,7 +1492,11 @@ struct clip_model_loader {
  ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
  }
  }
...
llama/patches/0005-solar-pro.patch

 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: jmorganca <jmorganca@gmail.com>
-Date: Tue, 8 Apr 2025 16:03:51 -0700
+Date: Sun, 20 Apr 2025 16:11:09 -0700
 Subject: [PATCH] solar-pro

 adds support for the Solar Pro architecture

@@ -15,7 +15,7 @@ adds support for the Solar Pro architecture
  7 files changed, 248 insertions(+)
 diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index a6fddc7f..0b0fedcd 100644
+index 62e1480b..f754bc8f 100644
 --- a/src/llama-arch.cpp
 +++ b/src/llama-arch.cpp
 @@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -31,10 +31,10 @@ index a6fddc7f..0b0fedcd 100644
  { LLM_KV_ATTENTION_SLIDING_WINDOW,        "%s.attention.sliding_window"        },
  { LLM_KV_ATTENTION_SCALE,                 "%s.attention.scale"                 },
 +{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
+ { LLM_KV_ATTENTION_KEY_LENGTH_MLA,        "%s.attention.key_length_mla"        },
+ { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,      "%s.attention.value_length_mla"      },
  { LLM_KV_ROPE_DIMENSION_COUNT,            "%s.rope.dimension_count"            },
- { LLM_KV_ROPE_DIMENSION_SECTIONS,         "%s.rope.dimension_sections"         },
-@@ -1482,6 +1484,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -1478,6 +1480,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  },
  },
@@ -59,7 +59,7 @@ index a6fddc7f..0b0fedcd 100644
  {
  LLM_ARCH_WAVTOKENIZER_DEC,
  {
-@@ -1671,6 +1691,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -1660,6 +1680,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
  {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
  // this tensor is loaded for T5, but never used
  {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -68,7 +68,7 @@ index a6fddc7f..0b0fedcd 100644
  {LLM_TENSOR_POS_NET_NORM,  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
  {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 2c2099b3..74aa3dd0 100644
+index 98ca00a1..439aaeab 100644
 --- a/src/llama-arch.h
 +++ b/src/llama-arch.h
 @@ -72,6 +72,7 @@ enum llm_arch {
@@ -84,10 +84,10 @@ index 2c2099b3..74aa3dd0 100644
  LLM_KV_ATTENTION_SLIDING_WINDOW,
  LLM_KV_ATTENTION_SCALE,
 +LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_KEY_LENGTH_MLA,
+ LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
  LLM_KV_ROPE_DIMENSION_COUNT,
- LLM_KV_ROPE_DIMENSION_SECTIONS,
-@@ -344,6 +346,7 @@ enum llm_tensor {
+@@ -340,6 +342,7 @@ enum llm_tensor {
  LLM_TENSOR_ENC_OUTPUT_NORM,
  LLM_TENSOR_CLS,
  LLM_TENSOR_CLS_OUT,
@@ -115,10 +115,10 @@ index 90dfe7a7..8a667960 100644
  if (il < n_layer) {
  return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
 diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index 4e0b5719..c3147cbc 100644
+index 80fcd65d..6e278945 100644
 --- a/src/llama-hparams.h
 +++ b/src/llama-hparams.h
-@@ -51,6 +51,8 @@ struct llama_hparams {
+@@ -55,6 +55,8 @@ struct llama_hparams {
  std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
  std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -127,7 +127,7 @@ index 4e0b5719..c3147cbc 100644
  uint32_t n_layer_dense_lead = 0;
  uint32_t n_lora_q = 0;
  uint32_t n_lora_kv = 0;
-@@ -149,6 +151,9 @@ struct llama_hparams {
+@@ -153,6 +155,9 @@ struct llama_hparams {
  // dimension of the recurrent state embeddings
  uint32_t n_embd_v_s() const;
@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
  llama_model_loader::llama_model_loader(
  const std::string & fname,
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index b74dd72c..5fbd0055 100644
+index 6b7bfecf..aba42819 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
-@@ -1372,6 +1372,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
@@ -175,7 +175,7 @@ index b74dd72c..5fbd0055 100644
  case LLM_ARCH_WAVTOKENIZER_DEC:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-@@ -3701,6 +3716,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -210,7 +210,7 @@ index b74dd72c..5fbd0055 100644
  layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
  layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-@@ -12244,6 +12287,165 @@ struct llm_build_chameleon : public llm_graph_context {
+@@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context {
  }
  };
@@ -316,7 +316,7 @@ index b74dd72c..5fbd0055 100644
 +
 +        cur = build_attn(inp_attn, gf,
 +                model.layers[il].wo, model.layers[il].bo,
-+                Qcur, Kcur, Vcur, nullptr, kq_scale, il);
++                Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
 +        cb(cur, "attn_out", il);
 +    }
 +
@@ -376,7 +376,7 @@ index b74dd72c..5fbd0055 100644
  struct llm_build_wavtokenizer_dec : public llm_graph_context {
  llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  ggml_tensor * cur;
-@@ -12993,6 +13195,10 @@ llm_graph_result_ptr llama_model::build_graph(
+@@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph(
  {
  llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
  } break;
@@ -387,7 +387,7 @@ index b74dd72c..5fbd0055 100644
  case LLM_ARCH_WAVTOKENIZER_DEC:
  {
  llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
-@@ -13139,6 +13345,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+@@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_GRANITE:
  case LLM_ARCH_GRANITE_MOE:
  case LLM_ARCH_CHAMELEON:
@@ -396,7 +396,7 @@ index b74dd72c..5fbd0055 100644
  return LLAMA_ROPE_TYPE_NORM;
 diff --git a/src/llama-model.h b/src/llama-model.h
-index 0f18dac1..e08d4ae4 100644
+index fd82d106..5865d5e9 100644
 --- a/src/llama-model.h
 +++ b/src/llama-model.h
 @@ -62,6 +62,7 @@ enum llm_type {
@@ -407,7 +407,7 @@ index 0f18dac1..e08d4ae4 100644
  LLM_TYPE_30B,
  LLM_TYPE_32B,
  LLM_TYPE_34B,
-@@ -305,6 +306,8 @@ struct llama_layer {
+@@ -307,6 +308,8 @@ struct llama_layer {
  struct ggml_tensor * ffn_up_scale   = nullptr;
  struct ggml_tensor * ffn_down_scale = nullptr;
...
llama/patches/000
7
-add-mllama-support.patch
→
llama/patches/000
6
-add-mllama-support.patch
View file @
e9e5f61c
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
From: jmorganca <jmorganca@gmail.com>
Date:
Tue, 8
Apr 2025 1
9:27:12
-0700
Date:
Sun, 20
Apr 2025 1
6:12:36
-0700
Subject: [PATCH] add mllama support
Subject: [PATCH] add mllama support
adds support for the llama 3.2 vision architecture
adds support for the llama 3.2 vision architecture
...
@@ -28,7 +28,7 @@ adds support for the llama 3.2 vision architecture
...
@@ -28,7 +28,7 @@ adds support for the llama 3.2 vision architecture
20 files changed, 475 insertions(+), 22 deletions(-)
20 files changed, 475 insertions(+), 22 deletions(-)
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index
91a07e2a..13127c7b
100644
index
3d566475..654d1358
100644
--- a/examples/llava/gemma3-cli.cpp
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -106,7 +106,7 @@
struct decode_embd_batch {
@@ -106,7 +106,7 @@
struct decode_embd_batch {
...
@@ -79,10 +79,10 @@ index 03a22cbb..5eb40bcd 100644
...
@@ -79,10 +79,10 @@ index 03a22cbb..5eb40bcd 100644
LOG_ERR("%s : failed to eval\n", __func__);
LOG_ERR("%s : failed to eval\n", __func__);
return false;
return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index
114c274b..a0e649ad
100644
index
3fd5bebc..f0cec596
100644
--- a/examples/llava/mtmd.cpp
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -2
1
3,7 +2
1
3,7 @@
struct decode_embd_batch {
@@ -2
3
3,7 +2
3
3,7 @@
struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
std::vector<int8_t> logits;
llama_batch batch;
llama_batch batch;
...
@@ -91,7 +91,7 @@ index 114c274b..a0e649ad 100644
...
@@ -91,7 +91,7 @@ index 114c274b..a0e649ad 100644
pos .resize(n_tokens);
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
seq_ids .resize(n_tokens + 1);
@@ -2
2
5,6 +2
2
5,7 @@
struct decode_embd_batch {
@@ -2
4
5,6 +2
4
5,7 @@
struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*embd =*/ embd,
...
@@ -99,9 +99,9 @@ index 114c274b..a0e649ad 100644
...
@@ -99,9 +99,9 @@ index 114c274b..a0e649ad 100644
/*pos =*/ pos.data(),
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*seq_id =*/ seq_ids.data(),
@@ -
29
1,7 +
29
2,8 @@
int32_t mtmd_helper_eval(mtmd_context * ctx,
@@ -
31
1,7 +
31
2,8 @@
int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t n_tokens = chunk.tokens_image
->n_tokens(
);
int32_t n_tokens =
mtmd_image_tokens_get_n_tokens(
chunk.tokens_image
.get()
);
float * embd = mtmd_get_output_embd(ctx);
float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
- decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
...
@@ -158,7 +158,7 @@ index 5657fbf0..f91896e4 100644
...
@@ -158,7 +158,7 @@ index 5657fbf0..f91896e4 100644
LLAMA_API void llama_free(struct llama_context * ctx);
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index
0b0fedcd..c1f78618
100644
index
f754bc8f..0568565f
100644
--- a/src/llama-arch.cpp
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@
@@ -6,6 +6,7 @@
...
@@ -174,10 +174,10 @@ index 0b0fedcd..c1f78618 100644
...
@@ -174,10 +174,10 @@ index 0b0fedcd..c1f78618 100644
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
@@ -271,6 +273,40 @@
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -269,6 +271,40 @@
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
},
},
...
@@ -218,7 +218,7 @@ index 0b0fedcd..c1f78618 100644
...
@@ -218,7 +218,7 @@ index 0b0fedcd..c1f78618 100644
{
{
LLM_ARCH_DECI,
LLM_ARCH_DECI,
{
{
@@ -16
92
,6 +17
28
,14 @@
static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -16
81
,6 +17
17
,14 @@
static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
...
@@ -234,7 +234,7 @@ index 0b0fedcd..c1f78618 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 74aa3dd0..f987844d 100644
index 439aaeab..6a989034 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -11,6 +11,7 @@
...
@@ -250,10 +250,10 @@ index 74aa3dd0..f987844d 100644
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -343,6 +345,14 @@ enum llm_tensor {
@@ -347,6 +349,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
...
@@ -297,10 +297,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 65135172..afe6f552 100644
index 32f59819..0343ba8a 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -858,7 +858,7 @@ float * llama_context::get_logits_ith(int32_t i) {
@@ -862,7 +862,7 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
}
...
@@ -309,7 +309,7 @@ index 65135172..afe6f552 100644
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -979,6 +979,10 @@ void llama_context::set_warmup(bool value) {
@@ -983,6 +983,10 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value;
}
...
@@ -320,7 +320,7 @@ index 65135172..afe6f552 100644
void llama_context::set_adapter_lora(
llama_adapter_lora * adapter,
float scale) {
@@ -1054,7 +1058,7 @@ int llama_context::encode(llama_batch & inp_batch) {
@@ -1058,7 +1062,7 @@ int llama_context::encode(llama_batch & inp_batch) {
const int64_t n_embd = hparams.n_embd;
...
@@ -329,7 +329,7 @@ index 65135172..afe6f552 100644
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@@ -1194,10 +1198,9 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1198,10 +1202,9 @@ int llama_context::decode(llama_batch & inp_batch) {
const llama_batch & batch = batch_allocr.batch;
...
@@ -341,7 +341,7 @@ index 65135172..afe6f552 100644
const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd;
@@ -1245,7 +1248,7 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1249,7 +1252,7 @@ int llama_context::decode(llama_batch & inp_batch) {
const bool logits_all = n_outputs_all == n_tokens_all;
...
@@ -350,7 +350,7 @@ index 65135172..afe6f552 100644
/* simple_split */ !kv_self->recurrent,
/* logits_all */ logits_all);
@@ -1479,12 +1482,11 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1483,12 +1486,11 @@ int llama_context::decode(llama_batch & inp_batch) {
int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
...
@@ -364,7 +364,7 @@ index 65135172..afe6f552 100644
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
@@ -1554,7 +1556,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
@@ -1558,7 +1560,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
void llama_context::output_reorder() {
auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) {
...
@@ -373,7 +373,7 @@ index 65135172..afe6f552 100644
const uint32_t n_embd = model.hparams.n_embd;
GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -2061,7 +2063,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
@@ -2065,7 +2067,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
...
@@ -382,7 +382,7 @@ index 65135172..afe6f552 100644
io.write(&logits_size, sizeof(logits_size));
@@ -2244,6 +2246,7 @@ llama_context_params llama_context_default_params() {
@@ -2248,6 +2250,7 @@ llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
...
@@ -390,7 +390,7 @@ index 65135172..afe6f552 100644
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
@@ -2371,6 +2374,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
@@ -2375,6 +2378,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup);
}
...
@@ -426,7 +426,7 @@ index 30e550f0..85ad91b9 100644
enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index cd955d63..83f3c5a8 100644
index a85e9728..d740c120 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
...
@@ -442,7 +442,7 @@ index cd955d63..83f3c5a8 100644
//
// llm_graph_context
//
@@ -1495,6 +1501,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
@@ -1506,6 +1512,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
}
...
@@ -469,7 +469,7 @@ index cd955d63..83f3c5a8 100644
llm_graph_input_attn_cross * inp,
ggml_cgraph * gf,
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 5b6618f9..51993998 100644
index d192dc14..260a2af2 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -86,6 +86,7 @@ public:
...
@@ -518,7 +518,7 @@ index 8a667960..6a02de03 100644
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
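The helper added above reduces to a membership test over the configured cross-attention layer indices, which are read from the %s.attention.cross_attention_layers key introduced earlier. A standalone illustration with made-up layer values, not the upstream code:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Same membership test as hparams.cross_attention_layers(il), but standalone.
static bool is_cross_attn_layer(const std::vector<uint32_t> & cross_attn_layers, uint32_t il) {
    return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
}

int main() {
    // Example values only; a real model stores its own list in the GGUF metadata.
    const std::vector<uint32_t> layers = {3, 8, 13, 18};
    for (uint32_t il = 0; il < 20; il++) {
        std::printf("layer %2u: %s\n", il, is_cross_attn_layer(layers, il) ? "cross-attention" : "self-attention");
    }
    return 0;
}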
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index c3147cbc..4567a0e9 100644
index 6e278945..c8a34d52 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -2,6 +2,8 @@
...
@@ -536,9 +536,9 @@ index c3147cbc..4567a0e9 100644
uint32_t n_rel_attn_bkts = 0;
+ uint32_t n_vocab = 0;
// for WavTokenizer
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
struct llama_hparams_posnet posnet;
uint32_t n_embd_head_k_mla = 0;
@@ -52,6 +55,7 @@ struct llama_hparams {
@@ -56,6 +59,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
...
@@ -546,7 +546,7 @@ index c3147cbc..4567a0e9 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -154,6 +158,9 @@ struct llama_hparams {
@@ -158,6 +162,9 @@ struct llama_hparams {
// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;
...
@@ -557,7 +557,7 @@ index c3147cbc..4567a0e9 100644
};
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index dbf5f118..9310f262 100644
index 7c9d46d8..69f8d35a 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init(
...
@@ -593,7 +593,7 @@ index a012aeae..2e11507d 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
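The get_arr excerpt above locates GGUF keys with gguf_find_key. As a rough illustration only (assuming ggml's gguf.h; this is not the loader's actual code), an integer-array key such as the cross-attention layer list could be read like this:

#include <cstdint>
#include <cstring>
#include <string>
#include <vector>
#include "gguf.h"

// Sketch: read a UINT32 array value (e.g. "llama.attention.cross_attention_layers")
// from an already-open gguf_context. Returns false if the key is missing or has
// a different type.
static bool read_u32_array(const gguf_context * ctx, const std::string & key, std::vector<uint32_t> & out) {
    const int64_t kid = gguf_find_key(ctx, key.c_str());
    if (kid < 0) {
        return false;
    }
    if (gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY ||
        gguf_get_arr_type(ctx, kid) != GGUF_TYPE_UINT32) {
        return false;
    }
    const size_t n = gguf_get_arr_n(ctx, kid);
    out.resize(n);
    std::memcpy(out.data(), gguf_get_arr_data(ctx, kid), n * sizeof(uint32_t));
    return true;
}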
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 5fbd0055..d5ad466e 100644
index aba42819..d051696c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
...
@@ -650,7 +650,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1548,7 +1562,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1550,7 +1564,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
...
@@ -659,7 +659,7 @@ index 5fbd0055..d5ad466e 100644
const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert;
@@ -1801,6 +1815,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1803,6 +1817,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
...
@@ -712,7 +712,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_DECI:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4665,6 +4725,246 @@ struct llm_build_llama : public llm_graph_context {
@@ -4683,6 +4743,246 @@ struct llm_build_llama : public llm_graph_context {
}
};
...
@@ -900,7 +900,7 @@ index 5fbd0055..d5ad466e 100644
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
...
@@ -959,7 +959,7 @@ index 5fbd0055..d5ad466e 100644
struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -12965,6 +13265,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -13017,6 +13317,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break;
...
@@ -970,7 +970,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_DECI:
{
llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13325,6 +13629,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13377,6 +13681,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4:
...
@@ -979,7 +979,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index e08d4ae4..21c4617b 100644
index 5865d5e9..72bab5be 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -11,6 +11,7 @@
...
@@ -998,7 +998,7 @@ index e08d4ae4..21c4617b 100644
LLM_TYPE_236B,
LLM_TYPE_314B,
LLM_TYPE_671B,
@@ -308,6 +310,16 @@ struct llama_layer {
@@ -310,6 +312,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;
...