Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinilm
Commits
fc97bbd8
"vscode:/vscode.git/clone" did not exist on "15cbe5f70adaade1a8a11afc37601fc6606e7e0d"
Commit
fc97bbd8
authored
Mar 11, 2026
by
qinyiqun
Browse files
Fix: Add lifecycle management to AWQ linear function
parent
f692d681
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
3 additions
and
2 deletions
+3
-2
csrc/layers/fused_linear.hpp
csrc/layers/fused_linear.hpp
+1
-1
csrc/models/llama/llama_attention.cpp
csrc/models/llama/llama_attention.cpp
+2
-1
No files found.
csrc/layers/fused_linear.hpp
View file @
fc97bbd8
...
@@ -207,7 +207,7 @@ private:
...
@@ -207,7 +207,7 @@ private:
#define INFINILM_QKV_LINEAR_W4A16AWQ_INIT(name, q_name, k_name, v_name, ...) \
#define INFINILM_QKV_LINEAR_W4A16AWQ_INIT(name, q_name, k_name, v_name, ...) \
name##_ = std::make_shared<layers::QKVParallelLinear>(__VA_ARGS__); \
name##_ = std::make_shared<layers::QKVParallelLinear>(__VA_ARGS__); \
auto awq_ptr = std::static_pointer_cast<infinicore::quantization::AWQ>(
this->
quantization
_);
\
auto awq_ptr = std::static_pointer_cast<infinicore::quantization::AWQ>(
name##_->get_
quantization
());
\
int packing_num = awq_ptr->get_packing_num(); \
int packing_num = awq_ptr->get_packing_num(); \
this->register_parameter(std::string(q_name) + ".qweight", name##_->get_q_weight_awq(packing_num)); \
this->register_parameter(std::string(q_name) + ".qweight", name##_->get_q_weight_awq(packing_num)); \
this->register_parameter(std::string(q_name) + ".qzeros", name##_->get_q_weight_zeros_awq(packing_num)); \
this->register_parameter(std::string(q_name) + ".qzeros", name##_->get_q_weight_zeros_awq(packing_num)); \
...
...
csrc/models/llama/llama_attention.cpp
View file @
fc97bbd8
...
@@ -112,12 +112,13 @@ LlamaAttention::LlamaAttention(std::shared_ptr<infinilm::config::ModelConfig> mo
...
@@ -112,12 +112,13 @@ LlamaAttention::LlamaAttention(std::shared_ptr<infinilm::config::ModelConfig> mo
dtype
,
device
,
tp_rank
,
tp_size
,
rank_info
.
comm
);
dtype
,
device
,
tp_rank
,
tp_size
,
rank_info
.
comm
);
break
;
break
;
case
infinicore
::
quantization
::
QuantScheme
::
AWQ_W4A16
:
case
infinicore
::
quantization
::
QuantScheme
::
AWQ_W4A16
:
{
INFINILM_QKV_LINEAR_W4A16AWQ_INIT
(
qkv_proj
,
"q_proj"
,
"k_proj"
,
"v_proj"
,
hidden_size_
,
head_dim_
,
model_config_
->
get
<
size_t
>
(
"num_attention_heads"
),
model_config_
->
get
<
size_t
>
(
"num_key_value_heads"
),
this
->
model_config_
->
get_quantization_method
(),
use_bias_
,
INFINILM_QKV_LINEAR_W4A16AWQ_INIT
(
qkv_proj
,
"q_proj"
,
"k_proj"
,
"v_proj"
,
hidden_size_
,
head_dim_
,
model_config_
->
get
<
size_t
>
(
"num_attention_heads"
),
model_config_
->
get
<
size_t
>
(
"num_key_value_heads"
),
this
->
model_config_
->
get_quantization_method
(),
use_bias_
,
dtype
,
device
,
rank_info
);
dtype
,
device
,
rank_info
);
INFINICORE_NN_MODULE_INIT
(
o_proj
,
model_config_
->
get
<
size_t
>
(
"num_attention_heads"
)
*
head_dim_
,
hidden_size_
,
this
->
model_config_
->
get_quantization_method
(),
use_output_bias_
,
INFINICORE_NN_MODULE_INIT
(
o_proj
,
model_config_
->
get
<
size_t
>
(
"num_attention_heads"
)
*
head_dim_
,
hidden_size_
,
this
->
model_config_
->
get_quantization_method
(),
use_output_bias_
,
dtype
,
device
,
tp_rank
,
tp_size
,
rank_info
.
comm
);
dtype
,
device
,
tp_rank
,
tp_size
,
rank_info
.
comm
);
break
;
break
;
}
default:
default:
INFINILM_QKV_LINEAR_INIT
(
qkv_proj
,
"q_proj"
,
"k_proj"
,
"v_proj"
,
hidden_size_
,
head_dim_
,
model_config_
->
get
<
size_t
>
(
"num_attention_heads"
),
model_config_
->
get
<
size_t
>
(
"num_key_value_heads"
),
this
->
model_config_
->
get_quantization_method
(),
use_bias_
,
INFINILM_QKV_LINEAR_INIT
(
qkv_proj
,
"q_proj"
,
"k_proj"
,
"v_proj"
,
hidden_size_
,
head_dim_
,
model_config_
->
get
<
size_t
>
(
"num_attention_heads"
),
model_config_
->
get
<
size_t
>
(
"num_key_value_heads"
),
this
->
model_config_
->
get_quantization_method
(),
use_bias_
,
dtype
,
device
,
rank_info
);
dtype
,
device
,
rank_info
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment