Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinilm
Commits
f692d681
Commit
f692d681
authored
Feb 27, 2026
by
qinyiqun
Browse files
Issue/243:支持w4a16 awq fp16推理
parent
e76bb324
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
133 additions
and
27 deletions
+133
-27
csrc/layers/fused_linear.cpp
csrc/layers/fused_linear.cpp
+77
-0
csrc/layers/fused_linear.hpp
csrc/layers/fused_linear.hpp
+56
-27
No files found.
csrc/layers/fused_linear.cpp
View file @
f692d681
...
@@ -170,6 +170,58 @@ infinicore::nn::Parameter QKVParallelLinear::get_v_weight_scale() const {
...
@@ -170,6 +170,58 @@ infinicore::nn::Parameter QKVParallelLinear::get_v_weight_scale() const {
0
,
tp_rank_
,
tp_size_
);
0
,
tp_rank_
,
tp_size_
);
}
}
// Query shard of the fused AWQ-packed QKV weight, tensor-parallel along dim 1.
// scaling_factor is the AWQ packing factor — the number of low-bit values
// stored per container element (e.g. int4 packed into int32 gives 8) — so the
// packed column count is the logical q output size divided by it.
infinicore::nn::Parameter QKVParallelLinear::get_q_weight_awq(int scaling_factor) const {
    const auto q_cols = q_out_size_ / scaling_factor;
    auto q_slice = weight_->narrow({{1, 0, q_cols}});
    return infinicore::nn::Parameter(q_slice, 1, tp_rank_, tp_size_);
}
// Key shard of the fused AWQ-packed QKV weight: starts right after the packed
// query columns along dim 1. scaling_factor is the AWQ packing factor used to
// convert logical output sizes into packed column counts.
infinicore::nn::Parameter QKVParallelLinear::get_k_weight_awq(int scaling_factor) const {
    const auto k_start = q_out_size_ / scaling_factor;
    const auto k_cols = k_out_size_ / scaling_factor;
    auto k_slice = weight_->narrow({{1, k_start, k_cols}});
    return infinicore::nn::Parameter(k_slice, 1, tp_rank_, tp_size_);
}
// Value shard of the fused AWQ-packed QKV weight: follows the packed q and k
// columns along dim 1. Note the offset divides the summed logical sizes once
// — (q + k) / factor — so integer division matches the packed layout.
infinicore::nn::Parameter QKVParallelLinear::get_v_weight_awq(int scaling_factor) const {
    const auto v_start = (q_out_size_ + k_out_size_) / scaling_factor;
    const auto v_cols = v_out_size_ / scaling_factor;
    auto v_slice = weight_->narrow({{1, v_start, v_cols}});
    return infinicore::nn::Parameter(v_slice, 1, tp_rank_, tp_size_);
}
// Query shard of the fused AWQ weight scales, tensor-parallel along dim 1.
// scaling_factor divides the logical q output size to get the column count;
// callers pass 1 when the scales tensor is not bit-packed.
infinicore::nn::Parameter QKVParallelLinear::get_q_weight_scale_awq(int scaling_factor) const {
    const auto q_cols = q_out_size_ / scaling_factor;
    auto q_scales = weight_scale_->narrow({{1, 0, q_cols}});
    return infinicore::nn::Parameter(q_scales, 1, tp_rank_, tp_size_);
}
// Key shard of the fused AWQ weight scales: starts after the q columns along
// dim 1, with both offset and length divided by scaling_factor.
infinicore::nn::Parameter QKVParallelLinear::get_k_weight_scale_awq(int scaling_factor) const {
    const auto k_start = q_out_size_ / scaling_factor;
    const auto k_cols = k_out_size_ / scaling_factor;
    auto k_scales = weight_scale_->narrow({{1, k_start, k_cols}});
    return infinicore::nn::Parameter(k_scales, 1, tp_rank_, tp_size_);
}
// Value shard of the fused AWQ weight scales: follows the q and k columns
// along dim 1. The offset divides the summed sizes once, matching the packed
// column layout under integer division.
infinicore::nn::Parameter QKVParallelLinear::get_v_weight_scale_awq(int scaling_factor) const {
    const auto v_start = (q_out_size_ + k_out_size_) / scaling_factor;
    const auto v_cols = v_out_size_ / scaling_factor;
    auto v_scales = weight_scale_->narrow({{1, v_start, v_cols}});
    return infinicore::nn::Parameter(v_scales, 1, tp_rank_, tp_size_);
}
// Query shard of the fused AWQ zero points, tensor-parallel along dim 1.
// scaling_factor is the AWQ packing factor applied to the logical q size.
infinicore::nn::Parameter QKVParallelLinear::get_q_weight_zeros_awq(int scaling_factor) const {
    const auto q_cols = q_out_size_ / scaling_factor;
    auto q_zeros = weight_zeros_->narrow({{1, 0, q_cols}});
    return infinicore::nn::Parameter(q_zeros, 1, tp_rank_, tp_size_);
}
// Key shard of the fused AWQ zero points: starts after the packed q columns
// along dim 1, with offset and length both divided by scaling_factor.
infinicore::nn::Parameter QKVParallelLinear::get_k_weight_zeros_awq(int scaling_factor) const {
    const auto k_start = q_out_size_ / scaling_factor;
    const auto k_cols = k_out_size_ / scaling_factor;
    auto k_zeros = weight_zeros_->narrow({{1, k_start, k_cols}});
    return infinicore::nn::Parameter(k_zeros, 1, tp_rank_, tp_size_);
}
// Value shard of the fused AWQ zero points: follows the packed q and k
// columns along dim 1; the offset divides the summed logical sizes once so
// integer division matches the packed layout.
infinicore::nn::Parameter QKVParallelLinear::get_v_weight_zeros_awq(int scaling_factor) const {
    const auto v_start = (q_out_size_ + k_out_size_) / scaling_factor;
    const auto v_cols = v_out_size_ / scaling_factor;
    auto v_zeros = weight_zeros_->narrow({{1, v_start, v_cols}});
    return infinicore::nn::Parameter(v_zeros, 1, tp_rank_, tp_size_);
}
infinicore
::
nn
::
Parameter
QKVParallelLinear
::
get_q_weight_zeros
()
const
{
infinicore
::
nn
::
Parameter
QKVParallelLinear
::
get_q_weight_zeros
()
const
{
return
infinicore
::
nn
::
Parameter
(
return
infinicore
::
nn
::
Parameter
(
weight_zeros_
->
narrow
({{
0
,
0
,
q_out_size_
}}),
0
,
tp_rank_
,
tp_size_
);
weight_zeros_
->
narrow
({{
0
,
0
,
q_out_size_
}}),
0
,
tp_rank_
,
tp_size_
);
...
@@ -320,4 +372,29 @@ bool GateUpParallelLinear::has_gate_bias() const {
...
@@ -320,4 +372,29 @@ bool GateUpParallelLinear::has_gate_bias() const {
bool
GateUpParallelLinear
::
has_up_bias
()
const
{
bool
GateUpParallelLinear
::
has_up_bias
()
const
{
return
up_bias_
;
return
up_bias_
;
}
}
// Gate half of the fused AWQ-packed gate/up weight: the first size(1)/2
// columns along dim 1, sharded for tensor-parallel rank tp_rank_ of tp_size_.
infinicore::nn::Parameter GateUpParallelLinear::get_gate_weight_awq() const {
    const auto half_cols = weight_->size(1) / 2;
    auto gate_slice = weight_->narrow({{1, 0, half_cols}});
    return infinicore::nn::Parameter(gate_slice, 1, tp_rank_, tp_size_);
}
// Up half of the fused AWQ-packed gate/up weight: the second size(1)/2
// columns along dim 1 (offset and length are both half the packed width).
infinicore::nn::Parameter GateUpParallelLinear::get_up_weight_awq() const {
    const auto half_cols = weight_->size(1) / 2;
    auto up_slice = weight_->narrow({{1, half_cols, half_cols}});
    return infinicore::nn::Parameter(up_slice, 1, tp_rank_, tp_size_);
}
// Gate half of the fused AWQ weight scales: first half of the columns along
// dim 1, sharded across tensor-parallel ranks.
infinicore::nn::Parameter GateUpParallelLinear::get_gate_weight_scale_awq() const {
    const auto half_cols = weight_scale_->size(1) / 2;
    auto gate_scales = weight_scale_->narrow({{1, 0, half_cols}});
    return infinicore::nn::Parameter(gate_scales, 1, tp_rank_, tp_size_);
}
// Up half of the fused AWQ weight scales: second half of the columns along
// dim 1 (offset and length both equal size(1)/2).
infinicore::nn::Parameter GateUpParallelLinear::get_up_weight_scale_awq() const {
    const auto half_cols = weight_scale_->size(1) / 2;
    auto up_scales = weight_scale_->narrow({{1, half_cols, half_cols}});
    return infinicore::nn::Parameter(up_scales, 1, tp_rank_, tp_size_);
}
// Gate half of the fused AWQ zero points: first half of the columns along
// dim 1, sharded across tensor-parallel ranks.
infinicore::nn::Parameter GateUpParallelLinear::get_gate_weight_zeros_awq() const {
    const auto half_cols = weight_zeros_->size(1) / 2;
    auto gate_zeros = weight_zeros_->narrow({{1, 0, half_cols}});
    return infinicore::nn::Parameter(gate_zeros, 1, tp_rank_, tp_size_);
}
// Up half of the fused AWQ zero points: second half of the columns along
// dim 1 (offset and length both equal size(1)/2).
infinicore::nn::Parameter GateUpParallelLinear::get_up_weight_zeros_awq() const {
    const auto half_cols = weight_zeros_->size(1) / 2;
    auto up_zeros = weight_zeros_->narrow({{1, half_cols, half_cols}});
    return infinicore::nn::Parameter(up_zeros, 1, tp_rank_, tp_size_);
}
}
// namespace infinilm::layers
}
// namespace infinilm::layers
csrc/layers/fused_linear.hpp
View file @
f692d681
...
@@ -58,6 +58,21 @@ public:
...
@@ -58,6 +58,21 @@ public:
infinicore
::
nn
::
Parameter
get_k_weight_zeros
()
const
;
infinicore
::
nn
::
Parameter
get_k_weight_zeros
()
const
;
infinicore
::
nn
::
Parameter
get_v_weight_zeros
()
const
;
infinicore
::
nn
::
Parameter
get_v_weight_zeros
()
const
;
// For computing the packing factor in awq quantization:
// Returns the number of low-bit elements packed into a single high-bit container element.
// For example: int4 → int32 yields a packing factor of 8 (32 bits / 4 bits = 8 int4 values per int32).
infinicore
::
nn
::
Parameter
get_q_weight_awq
(
int
scaling_factor
)
const
;
infinicore
::
nn
::
Parameter
get_k_weight_awq
(
int
scaling_factor
)
const
;
infinicore
::
nn
::
Parameter
get_v_weight_awq
(
int
scaling_factor
)
const
;
infinicore
::
nn
::
Parameter
get_q_weight_scale_awq
(
int
scaling_factor
)
const
;
infinicore
::
nn
::
Parameter
get_k_weight_scale_awq
(
int
scaling_factor
)
const
;
infinicore
::
nn
::
Parameter
get_v_weight_scale_awq
(
int
scaling_factor
)
const
;
infinicore
::
nn
::
Parameter
get_q_weight_zeros_awq
(
int
scaling_factor
)
const
;
infinicore
::
nn
::
Parameter
get_k_weight_zeros_awq
(
int
scaling_factor
)
const
;
infinicore
::
nn
::
Parameter
get_v_weight_zeros_awq
(
int
scaling_factor
)
const
;
infinicore
::
nn
::
Parameter
get_q_bias
()
const
;
infinicore
::
nn
::
Parameter
get_q_bias
()
const
;
infinicore
::
nn
::
Parameter
get_k_bias
()
const
;
infinicore
::
nn
::
Parameter
get_k_bias
()
const
;
infinicore
::
nn
::
Parameter
get_v_bias
()
const
;
infinicore
::
nn
::
Parameter
get_v_bias
()
const
;
...
@@ -132,6 +147,18 @@ public:
...
@@ -132,6 +147,18 @@ public:
infinicore
::
nn
::
Parameter
get_up_bias
()
const
;
infinicore
::
nn
::
Parameter
get_up_bias
()
const
;
infinicore
::
nn
::
Parameter
get_gate_weight_awq
()
const
;
infinicore
::
nn
::
Parameter
get_up_weight_awq
()
const
;
infinicore
::
nn
::
Parameter
get_up_weight_scale_awq
()
const
;
infinicore
::
nn
::
Parameter
get_up_weight_zeros_awq
()
const
;
infinicore
::
nn
::
Parameter
get_gate_weight_scale_awq
()
const
;
infinicore
::
nn
::
Parameter
get_gate_weight_zeros_awq
()
const
;
bool
has_gate_bias
()
const
;
bool
has_gate_bias
()
const
;
bool
has_up_bias
()
const
;
bool
has_up_bias
()
const
;
...
@@ -178,22 +205,24 @@ private:
...
@@ -178,22 +205,24 @@ private:
if (name##_->has_v_bias()) \
if (name##_->has_v_bias()) \
this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias());
this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias());
#define INFINILM_QKV_LINEAR_W4A16AWQ_INIT(name, q_name, k_name, v_name, ...) \
#define INFINILM_QKV_LINEAR_W4A16AWQ_INIT(name, q_name, k_name, v_name, ...) \
name##_ = std::make_shared<layers::QKVParallelLinear>(__VA_ARGS__); \
name##_ = std::make_shared<layers::QKVParallelLinear>(__VA_ARGS__); \
this->register_parameter(std::string(q_name) + ".qweight", name##_->get_q_weight()); \
auto awq_ptr = std::static_pointer_cast<infinicore::quantization::AWQ>(this->quantization_); \
this->register_parameter(std::string(q_name) + ".qzeros", name##_->get_q_weight_zeros()); \
int packing_num = awq_ptr->get_packing_num(); \
this->register_parameter(std::string(q_name) + ".scales", name##_->get_q_weight_scale()); \
this->register_parameter(std::string(q_name) + ".qweight", name##_->get_q_weight_awq(packing_num)); \
this->register_parameter(std::string(k_name) + ".qweight", name##_->get_k_weight()); \
this->register_parameter(std::string(q_name) + ".qzeros", name##_->get_q_weight_zeros_awq(packing_num)); \
this->register_parameter(std::string(k_name) + ".qzeros", name##_->get_k_weight_zeros()); \
this->register_parameter(std::string(q_name) + ".scales", name##_->get_q_weight_scale_awq(1)); \
this->register_parameter(std::string(k_name) + ".scales", name##_->get_k_weight_scale()); \
this->register_parameter(std::string(k_name) + ".qweight", name##_->get_k_weight_awq(packing_num)); \
this->register_parameter(std::string(v_name) + ".qweight", name##_->get_v_weight()); \
this->register_parameter(std::string(k_name) + ".qzeros", name##_->get_k_weight_zeros_awq(packing_num)); \
this->register_parameter(std::string(v_name) + ".qzeros", name##_->get_v_weight_zeros()); \
this->register_parameter(std::string(k_name) + ".scales", name##_->get_k_weight_scale_awq(1)); \
this->register_parameter(std::string(v_name) + ".scales", name##_->get_v_weight_scale()); \
this->register_parameter(std::string(v_name) + ".qweight", name##_->get_v_weight_awq(packing_num)); \
if (name##_->has_q_bias()) \
this->register_parameter(std::string(v_name) + ".qzeros", name##_->get_v_weight_zeros_awq(packing_num)); \
this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \
this->register_parameter(std::string(v_name) + ".scales", name##_->get_v_weight_scale_awq(1)); \
if (name##_->has_k_bias()) \
if (name##_->has_q_bias()) \
this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \
this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \
if (name##_->has_v_bias()) \
if (name##_->has_k_bias()) \
this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \
if (name##_->has_v_bias()) \
this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias());
this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias());
// ========================= Gate-Up Quantization ==============================
// ========================= Gate-Up Quantization ==============================
...
@@ -208,16 +237,16 @@ private:
...
@@ -208,16 +237,16 @@ private:
if (name##_->has_up_bias()) \
if (name##_->has_up_bias()) \
this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias());
this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias());
#define INFINILM_GATE_UP_LINEAR_W4A16AWQ_INIT(name, gate_name, up_name, ...) \
#define INFINILM_GATE_UP_LINEAR_W4A16AWQ_INIT(name, gate_name, up_name, ...)
\
name##_ = std::make_shared<layers::GateUpParallelLinear>(__VA_ARGS__); \
name##_ = std::make_shared<layers::GateUpParallelLinear>(__VA_ARGS__);
\
this->register_parameter(std::string(gate_name) + ".qweight", name##_->get_gate_weight()); \
this->register_parameter(std::string(gate_name) + ".qweight", name##_->get_gate_weight
_awq
()); \
this->register_parameter(std::string(gate_name) + ".
scale
s", name##_->get_gate_weight_
scale
()); \
this->register_parameter(std::string(gate_name) + ".
qzero
s", name##_->get_gate_weight_
zeros_awq
()); \
this->register_parameter(std::string(gate_name) + ".
qzero
s", name##_->get_gate_weight_
zeros
()); \
this->register_parameter(std::string(gate_name) + ".
scale
s", name##_->get_gate_weight_
scale_awq
()); \
this->register_parameter(std::string(up_name) + ".qweight", name##_->get_up_weight()); \
this->register_parameter(std::string(up_name) + ".qweight", name##_->get_up_weight
_awq
()); \
this->register_parameter(std::string(up_name) + ".
scale
s", name##_->get_up_weight_
scale
()); \
this->register_parameter(std::string(up_name) + ".
qzero
s", name##_->get_up_weight_
zeros_awq
()); \
this->register_parameter(std::string(up_name) + ".
qzero
s", name##_->get_up_weight_
zeros
()); \
this->register_parameter(std::string(up_name) + ".
scale
s", name##_->get_up_weight_
scale_awq
()); \
if (name##_->has_gate_bias()) \
if (name##_->has_gate_bias())
\
this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias()); \
this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias());
\
if (name##_->has_up_bias()) \
if (name##_->has_up_bias())
\
this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias());
this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias());
}
// namespace infinilm::layers
}
// namespace infinilm::layers
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment