ox696c / ktransformers · Commit 33cbd470

Authored Apr 28, 2025 by djw
Parent: 68c2b2e6

    support qwen3

Showing 8 changed files with 195 additions and 7 deletions (+195, -7)
csrc/ktransformers_ext/CMakeLists.txt                             +0   -1
ktransformers/optimize/optimize_rules/Qwen2-serve-amx.yaml        +96  -0
ktransformers/optimize/optimize_rules/Qwen2-serve.yaml            +0   -1
ktransformers/optimize/optimize_rules/Qwen3Moe-serve-amx.yaml     +96  -0
ktransformers/optimize/optimize_rules/Qwen3Moe-serve.yaml         +0   -1
ktransformers/server/args.py                                      +0   -1
ktransformers/server/backend/interfaces/balance_serve.py          +2   -2
ktransformers/server/balance_serve/inference/model_runner.py      +1   -1
csrc/ktransformers_ext/CMakeLists.txt

@@ -151,7 +151,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
         message(STATUS "Compiler and/or CPU do NOT support AVX512F")
         set(HAS_AVX512 False)
     endif()
-    set(HAS_AVX512 False)
     # check AMX
     string(FIND "${LSCPU_OUTPUT}" "amx" COMPILER_SUPPORTS_AMX)
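For context, the AMX probe that follows the removed line simply searches the captured lscpu output for the substring "amx". A rough Python sketch of the same idea, illustrative only and not part of the build (the build itself relies on the string(FIND ...) call above):

    # Rough equivalent of the CMake AMX check: scan `lscpu` output for AMX support.
    import subprocess

    def cpu_supports_amx() -> bool:
        try:
            out = subprocess.run(["lscpu"], capture_output=True, text=True, check=True).stdout
        except (FileNotFoundError, subprocess.CalledProcessError):
            return False
        return "amx" in out.lower()

    print(cpu_supports_amx())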
ktransformers/optimize/optimize_rules/Qwen2-serve-amx.yaml (new file, mode 100644)

- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
# - match:
#     name: "^model\\.layers\\..*$"  # regular expression
#     class: torch.nn.Linear  # only match modules matching name and class simultaneously
#   replace:
#     class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
#     kwargs:
#       generate_device: "cuda"
#       prefill_device: "cuda"
#       generate_op: "VLinearMarlin"
#       prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlockV2  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.KQwen2MoeAttention  # optimized attention implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KQwen2MoeRMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeMLP
  replace:
    class: ktransformers.operators.mlp.KQwen2MoeMLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
\ No newline at end of file
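Rule files like the one above are consumed by the injection framework when the model is assembled. A minimal sketch of how such a file is typically applied, assuming the optimize_and_load_gguf helper in ktransformers.optimize.optimize keeps its usual signature; the checkpoint and GGUF paths are placeholders:

    # Sketch only: build the model on the meta device, then inject optimized modules
    # per the YAML rules and load GGUF weights (paths below are hypothetical).
    import torch
    from transformers import AutoConfig, AutoModelForCausalLM
    from ktransformers.optimize.optimize import optimize_and_load_gguf

    rule_path = "ktransformers/optimize/optimize_rules/Qwen2-serve-amx.yaml"
    model_path = "/path/to/qwen2-moe-checkpoint"   # hypothetical HF checkpoint directory
    gguf_path = "/path/to/qwen2-moe-gguf"          # hypothetical GGUF weights directory

    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    with torch.device("meta"):
        model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
    # Walk the module tree, replace matched modules per the YAML rules, then load weights.
    optimize_and_load_gguf(model, rule_path, gguf_path, config)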
ktransformers/optimize/optimize_rules/Qwen2-serve.yaml

@@ -56,7 +56,6 @@
       generate_device: "cpu"
       generate_op: "KExpertsCPU"
       out_device: "cuda"
-      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
   recursive: False  # don't recursively inject submodules of this module
 - match:
     name: "^model\\.layers\\..*\\.self_attn$"
ktransformers/optimize/optimize_rules/Qwen3Moe-serve-amx.yaml (new file, mode 100644)

- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"
# - match:
#     name: "^model\\.layers\\..*$"  # regular expression
#     class: torch.nn.Linear  # only match modules matching name and class simultaneously
#   replace:
#     class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
#     kwargs:
#       generate_device: "cuda"
#       prefill_device: "cuda"
#       generate_op: "VLinearMarlin"
#       prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen3MoeSparseMoeBlockV2  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXBF16"  # or "AMXBF16" or "llamafile" (default)
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.KQwen3MoeAttention  # optimized attention implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeRMSNorm
  replace:
    class: ktransformers.operators.layernorm.KQwen3MoeRMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_qwen3_moe.Qwen3MoeMLP
  replace:
    class: ktransformers.operators.mlp.KQwen2MoeMLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
\ No newline at end of file
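Each rule selects modules by a regular expression over the module name and, optionally, by class; the negative lookahead in the layers rule keeps shared_expert_gate linears on the default path. A standalone sketch of how those patterns behave (module names below are invented for illustration):

    # Standalone illustration of the name patterns used in the rules above.
    import re

    layers_linear = r"^model\.layers\.(?!.*mlp\.shared_expert_gate).*$"
    self_attn = r"^model\.layers\..*\.self_attn$"

    print(bool(re.match(layers_linear, "model.layers.0.mlp.experts.3.gate_proj")))  # True: injected
    print(bool(re.match(layers_linear, "model.layers.0.mlp.shared_expert_gate")))   # False: excluded
    print(bool(re.match(self_attn, "model.layers.7.self_attn")))                    # True: attention swap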
ktransformers/optimize/optimize_rules/Qwen3Moe-serve.yaml

@@ -56,7 +56,6 @@
       generate_device: "cpu"
       generate_op: "KExpertsCPU"
       out_device: "cuda"
-      backend: "AMXBF16"  # or "AMXBF16" or "llamafile" (default)
   recursive: False  # don't recursively inject submodules of this module
 - match:
     name: "^model\\.layers\\..*\\.self_attn$"
ktransformers/server/args.py

@@ -94,7 +94,6 @@ class ArgumentParser:
         parser.add_argument("--user_algorithm", type=str, default=self.cfg.user_algorithm)
         parser.add_argument("--force_think", action=argparse.BooleanOptionalAction, type=bool, default=self.cfg.user_force_think)
         parser.add_argument("--use_cuda_graph", action=argparse.BooleanOptionalAction, type=bool, default=self.cfg.use_cuda_graph)
         # parser.add_argument("--use_cuda_graph", action=argparse.BooleanOptionalAction, type=bool, default=False)
         # web config
         parser.add_argument("--web_cross_domain", type=bool, default=self.cfg.web_cross_domain)
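The boolean flags in this hunk use argparse.BooleanOptionalAction, which generates a paired --no-* form for each option. A self-contained example of that behavior; the default value here is invented:

    # argparse.BooleanOptionalAction (Python 3.9+) creates paired --flag / --no-flag options.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--use_cuda_graph", action=argparse.BooleanOptionalAction, type=bool, default=True)

    print(parser.parse_args([]).use_cuda_graph)                       # True  (default)
    print(parser.parse_args(["--no-use_cuda_graph"]).use_cuda_graph)  # False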
ktransformers/server/backend/interfaces/balance_serve.py

@@ -56,8 +56,8 @@ ktransformer_rules_dir = (
     os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "..", "..", "..", "./optimize/optimize_rules/"
     )
 )
 default_optimize_rules = {
     "DeepseekV3ForCausalLM": ktransformer_rules_dir + "Moonlight-16B-A3B-serve.yaml",
     # "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat-serve.yaml",
     # "DeepseekV3ForCausalLM": ktransformer_rules_dir + "Moonlight-16B-A3B-serve.yaml",
     "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat-serve.yaml",
     "Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-serve.yaml",
     "Qwen3MoeForCausalLM": ktransformer_rules_dir + "Qwen3Moe-serve.yaml",
 }
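default_optimize_rules maps a checkpoint's architecture name to the rule file the serve backend loads by default, so a Qwen3 MoE checkpoint now picks up Qwen3Moe-serve.yaml without an explicit --optimize_config_path. A sketch of that lookup; the checkpoint path is a placeholder and the mapping mirrors the dict above:

    # Illustrative only: resolve which optimize-rule YAML a checkpoint would get by default.
    from transformers import AutoConfig

    rules = {
        "Qwen2MoeForCausalLM": "Qwen2-serve.yaml",
        "Qwen3MoeForCausalLM": "Qwen3Moe-serve.yaml",
    }
    config = AutoConfig.from_pretrained("/path/to/qwen3-moe-checkpoint", trust_remote_code=True)
    print(rules[config.architectures[0]])  # -> "Qwen3Moe-serve.yaml" for a Qwen3 MoE checkpoint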
ktransformers/server/balance_serve/inference/model_runner.py

@@ -85,7 +85,7 @@ class ModelRunner:
         elif isinstance(self.model, KQwen2MoeForCausalLM) or isinstance(self.model, KQwen3MoeForCausalLM):
             self.model.flash_infer_attn_plan(batch, self.bsz_tensor_buf, self.num_tokens_tensor_buf,
                                              num_q_heads=self.model.config.num_attention_heads,
                                              num_kv_heads=self.model.config.num_key_value_heads,
-                                             head_dim=128,
+                                             head_dim=self.model.config.head_dim if hasattr(self.model.config, 'head_num') else self.model.config.hidden_size // self.model.config.num_attention_heads,
                                              page_size=self.model.cache.page_size, causal=True,
                                              q_data_type=torch.bfloat16, kv_data_type=torch.bfloat16,
                                              cuda_graph_idx=cuda_graph_idx)
         else:
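The changed argument drops the hard-coded head_dim=128 and instead reads head_dim from the model config, falling back to hidden_size // num_attention_heads when the config does not expose it. A standalone sketch of that fallback, written so the attribute tested matches the attribute read (the committed line tests for 'head_num'):

    # Sketch of the head_dim resolution the new call relies on; config fields follow the
    # usual HF naming (head_dim, hidden_size, num_attention_heads). Illustrative only.
    def resolve_head_dim(config) -> int:
        if getattr(config, "head_dim", None) is not None:
            return config.head_dim                                # e.g. Qwen3 MoE sets this explicitly
        return config.hidden_size // config.num_attention_heads   # fallback for older configs

    class _DummyCfg:  # stand-in config for a quick check
        hidden_size, num_attention_heads = 4096, 32

    print(resolve_head_dim(_DummyCfg()))  # 128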