Commit 6e530515 (unverified)
Authored Dec 28, 2024 by HandH1998; committed by GitHub on Dec 28, 2024
Parent: 77d1210b

update sgl_moe_align_block_size usage (#2617)
Showing 3 changed files with 11 additions and 9 deletions:

- python/pyproject.toml (+1, -1)
- python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py (+10, -2)
- python/sglang/srt/model_executor/model_runner.py (+0, -6)
python/pyproject.toml

@@ -23,7 +23,7 @@ runtime_common = ["aiohttp", "decord", "fastapi",
     "psutil", "pydantic", "python-multipart",
     "pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
     "xgrammar>=0.1.6"]
-srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6", "sgl-kernel>=0.0.2.post9"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6", "sgl-kernel>=0.0.2.post10"]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
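The only substantive change in this file is the sgl-kernel floor: 0.0.2.post9 becomes 0.0.2.post10, presumably the release that ships the extended sgl_moe_align_block_size signature used in the next file. A minimal sketch, not part of the commit and assuming the packaging library is installed, of checking an installed wheel against the new pin:

    # Hypothetical runtime check, not part of this commit: confirm the
    # installed sgl-kernel satisfies the ">=0.0.2.post10" pin.
    from importlib.metadata import version
    from packaging.version import Version

    installed = Version(version("sgl-kernel"))
    required = Version("0.0.2.post10")
    if installed < required:
        raise RuntimeError(f"sgl-kernel {installed} < required {required}")

packaging orders post-releases correctly, so 0.0.2.post10 compares greater than 0.0.2.post9.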
python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py

@@ -272,8 +272,14 @@ def moe_align_block_size(
         (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
     )
     num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
-    # FIXME(zhyncs)
-    if not_hip and num_experts >= 224:
+    if not_hip and num_experts >= 256:
+        token_cnts_buffer = torch.empty(
+            (num_experts + 1) * num_experts, dtype=torch.int32, device=topk_ids.device
+        )
+        cumsum_buffer = torch.empty(
+            num_experts + 1, dtype=torch.int32, device=topk_ids.device
+        )
         sgl_moe_align_block_size(
             topk_ids,
             num_experts,
@@ -281,6 +287,8 @@ def moe_align_block_size(
             sorted_ids,
             expert_ids,
             num_tokens_post_pad,
+            token_cnts_buffer,
+            cumsum_buffer,
         )
     else:
         ops.moe_align_block_size(
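For context: moe_align_block_size groups the flattened token-to-expert assignments in topk_ids by expert and pads each expert's count up to a multiple of block_size, so the fused Triton kernel can sweep fixed-size blocks. The new code preallocates the scratch space the sgl-kernel path needs, (num_experts + 1) * num_experts int32 token counts plus a cumulative-sum buffer of num_experts + 1 entries, on the caller side rather than inside the op. A rough CPU reference for the outputs (a hypothetical sketch, not the CUDA kernel; moe_align_block_size_ref is an invented name):

    import torch

    def moe_align_block_size_ref(topk_ids, block_size, num_experts):
        # Flatten (num_tokens, topk) assignments; padding slots get a fill id.
        flat = topk_ids.flatten()
        pad_id = flat.numel()
        counts = torch.bincount(flat, minlength=num_experts)
        # Round each expert's token count up to a multiple of block_size.
        padded = ((counts + block_size - 1) // block_size) * block_size
        num_tokens_post_pad = int(padded.sum())
        sorted_ids = torch.full((num_tokens_post_pad,), pad_id, dtype=torch.int32)
        # One expert id per block of block_size slots.
        expert_ids = torch.repeat_interleave(
            torch.arange(num_experts, dtype=torch.int32), padded // block_size
        )
        offsets = (torch.cumsum(padded, 0) - padded).tolist()  # first slot per expert
        for tok, e in enumerate(flat.tolist()):
            sorted_ids[offsets[e]] = tok
            offsets[e] += 1
        return sorted_ids, expert_ids, num_tokens_post_pad

For example, topk_ids = torch.tensor([[0, 2], [1, 2]]) with block_size = 4 and num_experts = 4 gives num_tokens_post_pad = 12 and one block each for experts 0, 1, and 2, with unused slots holding pad_id = 4.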
python/sglang/srt/model_executor/model_runner.py

@@ -95,12 +95,6 @@ class ModelRunner:
         ):
             logger.info("MLA optimization is turned on. Use triton backend.")
             self.server_args.attention_backend = "triton"
-            # FIXME(HandH1998)
-            if (
-                "DeepseekV3ForCausalLM" in self.model_config.hf_config.architectures
-                and not self.server_args.disable_cuda_graph
-            ):
-                self.server_args.disable_cuda_graph = True
 
         if self.server_args.enable_double_sparsity:
             logger.info(
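The deleted block was a stopgap (note the removed FIXME) that force-disabled CUDA graphs whenever a DeepseekV3ForCausalLM checkpoint was loaded; presumably the updated sgl-kernel path made that override unnecessary, so the user's setting now stands. An illustrative sketch of the resulting behavior (the ServerArgs construction is simplified and the model path is a placeholder):

    # Illustrative only: after this commit, ModelRunner no longer flips
    # disable_cuda_graph to True for DeepSeek-V3 checkpoints.
    from sglang.srt.server_args import ServerArgs

    args = ServerArgs(model_path="deepseek-ai/DeepSeek-V3")  # placeholder path
    print(args.disable_cuda_graph)  # stays False unless the user disables it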