ox696c / ktransformers · Commits

Commit 3f9bbf11, authored Apr 28, 2025 by djw

    support qwen3, dont speak human language

Parent: f3d842a0
Changes: 30 · Showing 10 changed files with 227 additions and 185 deletions (+227 -185)
Changed files on this page:

  ktransformers/server/balance_serve/inference/forward_batch.py   +1    -1
  ktransformers/server/balance_serve/inference/model_runner.py    +96   -175
  ktransformers/server/balance_serve/sched_rpc.py                 +7    -2
  ktransformers/server/balance_serve/settings.py                  +106  -0
  ktransformers/server/config/config.py                           +1    -0
  ktransformers/server/requirements.txt                           +1    -1
  ktransformers/util/custom_gguf.py                               +12   -3
  pyproject.toml                                                  +1    -1
  requirements-local_chat.txt                                     +1    -1
  third_party/custom_flashinfer                                   +1    -1
ktransformers/server/balance_serve/inference/forward_batch.py

@@ -281,4 +281,4 @@ class ForwardBatchOutput:
         self.generated_tokens_num = []
         self.top_ps = []
         self.temperatures = []
-        pass
+        self.num_batchs = 1
\ No newline at end of file
ktransformers/server/balance_serve/inference/model_runner.py

(Diff collapsed in this view: +96 additions, -175 deletions.)
ktransformers/server/balance_serve/sched_rpc.py

@@ -10,7 +10,7 @@ current_file_path = os.path.abspath(__file__)
 # sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
 import pickle
 import argparse
-from ktransformers.server.balance_serve.settings import sched_ext, create_sched_settings
+from ktransformers.server.balance_serve.settings import sched_ext, create_sched_settings, create_sched_settings_qwen2moe, create_sched_settings_qwen3moe

@@ -209,5 +209,10 @@ if __name__ == '__main__':
     args = parser.parse_args()
     with open(args.config, "rb") as f:
         main_args = pickle.load(f)
-    settings = create_sched_settings(main_args)
+    if main_args.architectures == "Qwen2MoeForCausalLM":
+        settings = create_sched_settings_qwen2moe(main_args)
+    elif main_args.architectures == "Qwen3MoeForCausalLM":
+        settings = create_sched_settings_qwen3moe(main_args)
+    else:
+        settings = create_sched_settings(main_args)
     start_server(settings, main_args)
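The branch added above is the entire architecture dispatch. Purely as an illustration (this sketch is not part of the commit), it is equivalent to a table lookup keyed on the architecture string, with the original create_sched_settings as the fallback; it assumes the three factory functions imported in sched_rpc.py:

    # Illustrative sketch only, not committed code.
    _SETTINGS_FACTORIES = {
        "Qwen2MoeForCausalLM": create_sched_settings_qwen2moe,
        "Qwen3MoeForCausalLM": create_sched_settings_qwen3moe,
    }

    def build_settings(main_args):
        # Architectures without a dedicated factory fall back to the
        # generic create_sched_settings, matching the else branch above.
        factory = _SETTINGS_FACTORIES.get(main_args.architectures,
                                          create_sched_settings)
        return factory(main_args)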
ktransformers/server/balance_serve/settings.py

@@ -11,6 +11,8 @@ from time import sleep
 import sched_ext
 from transformers import AutoConfig
+from ktransformers.models.configuration_qwen3_moe import Qwen3MoeConfig

 def create_sched_settings(args):
     default_sample_options = sched_ext.SampleOptions()
     model_name = os.path.basename(os.path.normpath(args.model_dir))

@@ -64,7 +66,111 @@ def create_sched_settings(args):
     return settings

+def create_sched_settings_qwen2moe(args):
+    default_sample_options = sched_ext.SampleOptions()
+    model_name = os.path.basename(os.path.normpath(args.model_dir))
+    input_model_settings = sched_ext.ModelSettings()
+    input_model_settings.model_path = args.model_dir
+    input_model_settings.params_count = int(0)
+    model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
+    input_model_settings.layer_count = model_config.num_hidden_layers
+    input_model_settings.num_k_heads = model_config.num_key_value_heads # model_config["num_key_value_heads"]
+    input_model_settings.k_head_dim = 128
+    input_model_settings.bytes_per_params = 2
+    input_model_settings.bytes_per_kv_cache_element = 2
+    settings = sched_ext.Settings()
+    settings.model_name = model_name
+    settings.quant_type = "BF16"
+    settings.model_settings = input_model_settings
+    settings.page_size = args.page_size
+    settings.gpu_device_count = 1 # tp
+    settings.gpu_device_id = [i for i in range(settings.gpu_device_count)]
+    # settings.gpu_memory_size = args.cache_lens*576*2
+    settings.gpu_memory_size = args.gpu_memory_size
+    settings.memory_utilization_percentage = args.utilization_percentage
+    max_batch_size = args.max_batch_size
+    chunk_size = args.chunk_size
+    max_decode_batch_size = max_batch_size - 2
+    settings.max_batch_size = max_batch_size
+    settings.recommended_chunk_prefill_token_count = (chunk_size - max_decode_batch_size) // 2
+    settings.sample_options = default_sample_options
+    settings.sched_metrics_port = args.sched_metrics_port
+    settings.gpu_only = args.memory_gpu_only
+    settings.use_self_defined_head_dim = False
+    settings.self_defined_head_dim = 576
+    settings.full_kv_cache_on_each_gpu = True
+    settings.k_cache_on = True
+    settings.v_cache_on = True
+    settings.kvc2_root_path = '/mnt/data/persist-kvc'
+    settings.kvc2_config_path = args.kvc2_config_dir
+    settings.memory_pool_size_GB = args.cpu_memory_size_GB
+    settings.evict_count = 40
+    settings.kvc2_metrics_port = args.kvc2_metrics_port
+    settings.load_from_disk = False
+    settings.save_to_disk = True
+    settings.strategy_name = args.sched_strategy
+    settings.auto_derive()
+    return settings
+
+def create_sched_settings_qwen3moe(args):
+    default_sample_options = sched_ext.SampleOptions()
+    model_name = os.path.basename(os.path.normpath(args.model_dir))
+    input_model_settings = sched_ext.ModelSettings()
+    input_model_settings.model_path = args.model_dir
+    input_model_settings.params_count = int(0)
+    model_config = Qwen3MoeConfig.from_pretrained(args.model_dir, trust_remote_code=True)
+    input_model_settings.layer_count = model_config.num_hidden_layers
+    input_model_settings.num_k_heads = model_config.num_key_value_heads # model_config["num_key_value_heads"]
+    input_model_settings.k_head_dim = 128
+    input_model_settings.bytes_per_params = 2
+    input_model_settings.bytes_per_kv_cache_element = 2
+    settings = sched_ext.Settings()
+    settings.model_name = model_name
+    settings.quant_type = "BF16"
+    settings.model_settings = input_model_settings
+    settings.page_size = args.page_size
+    settings.gpu_device_count = 1 # tp
+    settings.gpu_device_id = [i for i in range(settings.gpu_device_count)]
+    # settings.gpu_memory_size = args.cache_lens*576*2
+    settings.gpu_memory_size = args.gpu_memory_size
+    settings.memory_utilization_percentage = args.utilization_percentage
+    max_batch_size = args.max_batch_size
+    chunk_size = args.chunk_size
+    max_decode_batch_size = max_batch_size - 2
+    settings.max_batch_size = max_batch_size
+    settings.recommended_chunk_prefill_token_count = (chunk_size - max_decode_batch_size) // 2
+    settings.sample_options = default_sample_options
+    settings.sched_metrics_port = args.sched_metrics_port
+    settings.gpu_only = args.memory_gpu_only
+    settings.use_self_defined_head_dim = False
+    settings.self_defined_head_dim = 576
+    settings.full_kv_cache_on_each_gpu = True
+    settings.k_cache_on = True
+    settings.v_cache_on = True
+    settings.kvc2_root_path = '/mnt/data/persist-kvc'
+    settings.kvc2_config_path = args.kvc2_config_dir
+    settings.memory_pool_size_GB = args.cpu_memory_size_GB
+    settings.evict_count = 40
+    settings.kvc2_metrics_port = args.kvc2_metrics_port
+    settings.load_from_disk = False
+    settings.save_to_disk = True
+    settings.strategy_name = args.sched_strategy
+    settings.auto_derive()
+    return settings
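To make the batching arithmetic in the two new factories concrete, with hypothetical values (the real ones come from args): if max_batch_size = 32 and chunk_size = 512, then max_decode_batch_size = 32 - 2 = 30, and recommended_chunk_prefill_token_count = (512 - 30) // 2 = 241; that is, the prefill recommendation is half of the chunk token budget left after subtracting the decode batch size.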
ktransformers/server/config/config.py

@@ -100,6 +100,7 @@ class Config(metaclass=Singleton):
         # to make sure it consistent with previous version
         self.model_path: str = self.model_dir
         self.model_name: str = self.model.get("name", "")
+        self.architectures: str = self.model.get("name", "")
         self.model_device: str = self.model.get("device", "cuda:0")
         self.gguf_path: Optional[str] = self.model.get("gguf_path", None)
         self.use_cuda_graph = self.model.get("use_cuda_graph", True)
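A cross-reference worth noting: the new architectures attribute is filled from the same "name" key as model_name, so the architecture-specific scheduling in sched_rpc.py above appears to key off the configured model name; e.g. a model section whose name is "Qwen3MoeForCausalLM" would take the Qwen3 branch. (This reading is inferred from the two diffs shown here, not stated in the commit.)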
ktransformers/server/requirements.txt

 torch >= 2.3.0
-transformers == 4.43.2
+transformers == 4.51.3
 fastapi >= 0.111.0
 langchain >= 0.2.0
 blessed >= 1.20.0
ktransformers/util/custom_gguf.py

@@ -912,6 +912,9 @@ def translate_name_to_gguf(name):
     name = name.replace(".self_attn.q_a_proj", ".attn_q_a")
     name = name.replace(".self_attn.q_a_layernorm", ".attn_q_a_norm")
     name = name.replace(".self_attn.q_b_proj", ".attn_q_b")
+    name = name.replace(".self_attn.q_norm", ".attn_q_norm")
+    name = name.replace(".self_attn.k_norm", ".attn_k_norm")
     name = name.replace(".shared_expert.", ".shared_experts.")
     name = name.replace(".shared_expert_", ".shared_experts_")

@@ -922,17 +925,23 @@ def translate_name_to_gguf(name):
     name = name.replace(".mlp.shared_experts.gate_proj", ".ffn_gate_shexp")
     name = name.replace(".mlp.shared_experts.up_proj", ".ffn_up_shexp")
     name = name.replace(".mlp.shared_experts_gate", ".ffn_gate_inp_shexp")
     name = name.replace(".mlp.experts", "")
+    name = name.replace(".mlp.experts.ffn_down_exps", ".ffn_down_exps")
+    name = name.replace(".mlp.experts.ffn_gate_exps", ".ffn_gate_exps")
+    name = name.replace(".mlp.experts.ffn_up_exps", ".ffn_up_exps")
     name = name.replace(".block_sparse_moe.gate.", ".ffn_gate_inp.")
     name = name.replace(".block_sparse_moe.experts", "")
+    name = name.replace(".feed_forward.experts", "")
+    name = name.replace(".feed_forward.router", ".ffn_gate_inp")
+    name = name.replace(".feed_forward.shared_experts.down_proj", ".ffn_down_shexp")
+    name = name.replace(".feed_forward.shared_experts.gate_proj", ".ffn_gate_shexp")
+    name = name.replace(".feed_forward.shared_experts.up_proj", ".ffn_up_shexp")
     return name

 if __name__ == '__main__':
     gguf_path = '/mnt/data/model/DeepSeek-Coder-V2-GGUF-WJH'
     loader = GGUFLoader(gguf_path)
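As a quick sanity check of the two new attention-norm rules, here is a minimal excerpt-style sketch (hypothetical tensor name; the real translate_name_to_gguf applies many more replacements than shown):

    # Excerpt of only the two rules added above; not the full function.
    def _demo(name):
        name = name.replace(".self_attn.q_norm", ".attn_q_norm")
        name = name.replace(".self_attn.k_norm", ".attn_k_norm")
        return name

    print(_demo("model.layers.0.self_attn.q_norm.weight"))
    # -> model.layers.0.attn_q_norm.weight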
pyproject.toml

@@ -16,7 +16,7 @@ dynamic = ["version"]
 dependencies = [
     "torch >= 2.3.0",
-    "transformers == 4.43.2",
+    "transformers == 4.51.3",
     "fastapi >= 0.111.0",
     "uvicorn >= 0.30.1",
     "langchain >= 0.2.0",
requirements-local_chat.txt

 fire
-transformers==4.43.2
+transformers==4.51.3
 numpy
 torch>=2.3.0
 packaging
third_party/custom_flashinfer

custom_flashinfer @ fd94393f...af4259e8
-Subproject commit fd94393fb5b8ba8bae9c0bd6ab1c2a429d81ac76
+Subproject commit af4259e8a33f095b419d1fd1733a50b22fc84c49