OpenDAS / LLaMA-Factory

Commit ca625f43, authored Mar 30, 2026 by shihm
uodata
parent 7164651d
Changes: 327. Showing 20 changed files with 1491 additions and 0 deletions (+1491, -0).
examples/inference/qwen3vl.yaml  (+4, -0)
examples/ktransformers/infer_lora/deepseek2_lora_sft_kt.yaml  (+10, -0)
examples/ktransformers/infer_lora/deepseek3_kt.yaml  (+9, -0)
examples/ktransformers/infer_lora/deepseek3_lora_sft_kt.yaml  (+10, -0)
examples/ktransformers/infer_lora/qwen3moe_lora_sft_kt.yaml  (+10, -0)
examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml  (+69, -0)
examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Chat.yaml  (+68, -0)
examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml  (+139, -0)
examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml  (+69, -0)
examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft.yaml  (+68, -0)
examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat.yaml  (+68, -0)
examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-amx.yaml  (+77, -0)
examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml  (+392, -0)
examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml  (+156, -0)
examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx.yaml  (+77, -0)
examples/ktransformers/kt_optimize_rules/Qwen3Moe-sft-amx.yaml  (+80, -0)
examples/ktransformers/train_lora/deepseek2_lora_sft_kt.yaml  (+52, -0)
examples/ktransformers/train_lora/deepseek3_lora_sft_kt.yaml  (+52, -0)
examples/ktransformers/train_lora/qwen3moe_lora_sft_kt.yaml  (+52, -0)
examples/megatron/qwen2_vl_full.yaml  (+29, -0)
examples/inference/qwen3vl.yaml (new file, mode 100644)

model_name_or_path: Qwen/Qwen3-VL-4B-Instruct
template: qwen3_vl_nothink
infer_backend: huggingface  # choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code: true
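These example configs are consumed directly by the LLaMA-Factory CLI. A minimal usage sketch, assuming LLaMA-Factory is installed and `llamafactory-cli` is on PATH:

    # interactive chat with Qwen3-VL on the huggingface backend
    llamafactory-cli chat examples/inference/qwen3vl.yaml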
examples/ktransformers/infer_lora/deepseek2_lora_sft_kt.yaml (new file, mode 100644)

model_name_or_path: deepseek-ai/DeepSeek-V2-Lite
adapter_name_or_path: saves/Kllama_deepseekV2
template: deepseek
infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code: true
use_kt: true  # use KTransformers as the backend for inference with the LoRA SFT adapter
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
cpu_infer: 32
chunk_size: 8192
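Each kt_optimize_rule file is a list of match/replace entries (see the rule files added below): match selects modules by a name regex and/or class, and replace injects a KTransformers operator with device and kernel kwargs. Note that the kt_optimize_rule paths as written point at examples/kt_optimize_rules/, while this commit adds the rule files under examples/ktransformers/kt_optimize_rules/, so the paths may need adjusting. A usage sketch, assuming the adapter in saves/Kllama_deepseekV2 was produced by the companion train_lora config:

    # chat through the KTransformers backend; cpu_infer: 32 gives the CPU expert kernels 32 threads
    llamafactory-cli chat examples/ktransformers/infer_lora/deepseek2_lora_sft_kt.yaml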
examples/ktransformers/infer_lora/deepseek3_kt.yaml (new file, mode 100644)

model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
template: deepseek
infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code: true
use_kt: true  # use KTransformers as the inference backend
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
cpu_infer: 32
chunk_size: 8192
examples/ktransformers/infer_lora/deepseek3_lora_sft_kt.yaml (new file, mode 100644)

model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
adapter_name_or_path: saves/Kllama_deepseekV3
template: deepseek
infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code: true
use_kt: true  # use KTransformers as the backend for inference with the LoRA SFT adapter
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
cpu_infer: 32
chunk_size: 8192
examples/ktransformers/infer_lora/qwen3moe_lora_sft_kt.yaml (new file, mode 100644)

model_name_or_path: Qwen/Qwen3-235B-A22B-Instruct-2507
adapter_name_or_path: saves/Kllama_Qwen3MoE_235bA22b
template: qwen3_nothink
infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code: true
use_kt: true  # use KTransformers as the backend for inference with the LoRA SFT adapter
kt_optimize_rule: examples/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
cpu_infer: 32
chunk_size: 8192
examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml (new file, mode 100644)

- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Chat.yaml (new file, mode 100644)

- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml (new file, mode 100644)

- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    name: "^model\\.layers\\.(0|[1-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:0"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:1"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
      transfer_map:
        10: "cuda:1"
- match:
    name: "^model\\.layers\\.(0|[1-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml (new file, mode 100644)

- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft.yaml (new file, mode 100644)

- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat.yaml (new file, mode 100644)

- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-amx.yaml (new file, mode 100644)

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False  # change this to True to enable long context (prefill may be slower)
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml (new file, mode 100644)

- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

# === Rotary Embedding Replacement ===
# GPU 0: layers 0-14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
# GPU 1: layers 15-29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
# GPU 2: layers 30-44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
# GPU 3: layers 45-60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# === Linear Layers Replacement (excluding self_attn.kv_b_proj) ===
# GPU 0: layers 0-14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
# GPU 1: layers 15-29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
# GPU 2: layers 30-44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
# GPU 3: layers 45-60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

# === MLP (MoE) Replacement ===
# GPU 0: layers 0-14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
# GPU 1: layers 15-29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
# GPU 2: layers 30-44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
# GPU 3: layers 45-60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# === MLP Gate Replacement ===
# GPU 0: layers 0-14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
# GPU 1: layers 15-29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
# GPU 2: layers 30-44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
# GPU 3: layers 45-60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# === MLP Experts Replacement ===
# Replace with the marlin expert: uncomment and adjust the layer numbers as needed.
# Each layer of marlin experts takes about 6 GB of GPU memory.
# !!!Remember to disable cuda graph if you are using the marlin expert!!!
# !!!KExpertsTorch is untested; we don't have enough VRAM!!!
# GPU 0: layers 3-4
# - match:
#     name: "^model\\.layers\\.([3-4])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:0"
#       generate_op: "KExpertsMarlin"
#   recursive: False
# # GPU 1: layers 15-17
# - match:
#     name: "^model\\.layers\\.(1[5-7])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:1"
#       generate_op: "KExpertsMarlin"
#   recursive: False
# # GPU 2: layers 30-32
# - match:
#     name: "^model\\.layers\\.(3[0-2])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:2"
#       generate_op: "KExpertsMarlin"
#   recursive: False
# # GPU 3: layers 45-46
# - match:
#     name: "^model\\.layers\\.(4[5-6])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:3"
#       generate_op: "KExpertsMarlin"
#   recursive: False

# === MLP Experts Replacement ===
# GPU 0: layers 0-14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:0"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False
# GPU 1: layers 15-29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:1"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False
# GPU 2: layers 30-44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:2"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:2"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False
# GPU 3: layers 45-60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:3"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:3"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False

# === Self-Attention Replacement ===
# GPU 0: layers 0-14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      absorb_for_prefill: False
# GPU 1: layers 15-29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      absorb_for_prefill: False
# GPU 2: layers 30-44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
      absorb_for_prefill: False
# GPU 3: layers 45-60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      absorb_for_prefill: False

# === Overall Model Replacement with Transfer Map ===
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 means layer-wise prefill is disabled
      transfer_map:
        15: "cuda:1"  # layers 15+ on GPU 1
        30: "cuda:2"  # layers 30+ on GPU 2
        45: "cuda:3"  # layers 45+ on GPU 3

# === Default Catch-All for Other Modules ===
# GPU 0: layers 0-14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
# GPU 1: layers 15-29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
# GPU 2: layers 30-44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
# For the final modules (GPU 3 layers and model.norm), ensure they are on GPU 3
- match:
    name: "(^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.)|(^model\\.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml (new file, mode 100644)

- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:0"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:1"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
      transfer_map:
        30: "cuda:1"
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx.yaml (new file, mode 100644)

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False  # change this to True to enable long context (prefill may be slower)
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
examples/ktransformers/kt_optimize_rules/Qwen3Moe-sft-amx.yaml (new file, mode 100644)

- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
# - match:
#     name: "^model\\.layers\\..*$"  # regular expression
#     class: torch.nn.Linear  # only match modules matching name and class simultaneously
#   replace:
#     class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
#     kwargs:
#       generate_device: "cuda"
#       prefill_device: "cuda"
#       generate_op: "KLinearTorch"
#       prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
  replace:
    class: ktransformers.operators.experts.KQwen3MoeSparseMoeBlock  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8"  # or "AMXBF16"
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KQwen3MoeAttention  # optimized attention implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen3MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0
examples/ktransformers/train_lora/deepseek2_lora_sft_kt.yaml (new file, mode 100644)

### model
model_name_or_path: deepseek-ai/DeepSeek-V2-Lite
trust_remote_code: true

### method
stage: sft
do_train: true
finetuning_type: lora
lora_rank: 8
lora_target: all

### dataset
dataset: identity
template: deepseek
cutoff_len: 2048
max_samples: 100000
overwrite_cache: true
preprocessing_num_workers: 16
dataloader_num_workers: 4

### output
output_dir: saves/Kllama_deepseekV2
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
save_only_model: false
report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
resume_from_checkpoint: null

### ktransformers
use_kt: true  # use KTransformers as the LoRA SFT backend
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
cpu_infer: 32
chunk_size: 8192

### eval
# eval_dataset: alpaca_en_demo
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
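The output_dir above (saves/Kllama_deepseekV2) is exactly the adapter_name_or_path that the matching infer_lora config loads, so training and inference chain together. A launch sketch, assuming the standard CLI entry point:

    # LoRA SFT of DeepSeek-V2-Lite on the KTransformers backend
    llamafactory-cli train examples/ktransformers/train_lora/deepseek2_lora_sft_kt.yaml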
examples/ktransformers/train_lora/deepseek3_lora_sft_kt.yaml (new file, mode 100644)

### model
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
trust_remote_code: true

### method
stage: sft
do_train: true
finetuning_type: lora
lora_rank: 8
lora_target: all

### dataset
dataset: identity
template: deepseek
cutoff_len: 2048
max_samples: 100000
overwrite_cache: true
preprocessing_num_workers: 16
dataloader_num_workers: 4

### output
output_dir: saves/Kllama_deepseekV3
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
save_only_model: false
report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
resume_from_checkpoint: null

### ktransformers
use_kt: true  # use KTransformers as the LoRA SFT backend
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
cpu_infer: 32
chunk_size: 8192

### eval
# eval_dataset: alpaca_en_demo
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/ktransformers/train_lora/qwen3moe_lora_sft_kt.yaml (new file, mode 100644)

### model
model_name_or_path: Qwen/Qwen3-235B-A22B-Instruct-2507
trust_remote_code: true

### method
stage: sft
do_train: true
finetuning_type: lora
lora_rank: 8
lora_target: all

### dataset
dataset: identity, alpaca_en_demo
template: qwen3_nothink
cutoff_len: 2048
max_samples: 100000
overwrite_cache: true
preprocessing_num_workers: 16
dataloader_num_workers: 4

### output
output_dir: saves/Kllama_Qwen3MoE_235bA22b
logging_steps: 10
save_steps: 200
plot_loss: true
overwrite_output_dir: true
save_only_model: false
report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
resume_from_checkpoint: null

### ktransformers
use_kt: true  # use KTransformers as the LoRA SFT backend
kt_optimize_rule: examples/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
cpu_infer: 32
chunk_size: 8192

### eval
# eval_dataset: alpaca_en_demo
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/megatron/qwen2_vl_full.yaml (new file, mode 100644)

model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
image_max_pixels: 262144
video_max_pixels: 16384
do_train: true
stage: sft
finetuning_type: full  # only full is supported for now
dataset: llava_1k_en
preprocessing_num_workers: 8
cutoff_len: 4096
template: qwen2_vl
output_dir: saves/mca/qwen2_vl_full
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
num_train_epochs: 2
learning_rate: 2e-5
logging_steps: 1
save_steps: 100
lr_scheduler_type: cosine
bf16: true

# mcore speed up
tensor_model_parallel_size: 4
sequence_parallel: true
pipeline_model_parallel_size: 2
bias_activation_fusion: true
apply_rope_fusion: true
use_distributed_optimizer: true
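With tensor_model_parallel_size: 4 and pipeline_model_parallel_size: 2, a single model replica spans 4 x 2 = 8 GPUs, so the run needs a world size that is a multiple of 8. A hedged sketch, assuming the standard CLI entry point also drives the Megatron/mcore path:

    # full SFT of Qwen2-VL-7B with TP=4, PP=2 (8 GPUs per model replica)
    llamafactory-cli train examples/megatron/qwen2_vl_full.yaml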