sglang · Commits · 4dbf4360

Commit 4dbf4360 (Unverified)
fix: zero_init buffer (#9065)

Authored Aug 14, 2025 by eigen; committed via GitHub on Aug 14, 2025
Co-authored-by: Yineng Zhang <me@zhyncs.com>
Parent: 3d6be1fb

Showing 10 changed files with 27 additions and 16 deletions (+27 -16)
Files changed:

python/pyproject.toml  (+2 -2)
python/sglang/srt/entrypoints/engine.py  (+1 -1)
python/sglang/srt/layers/attention/flashinfer_backend.py  (+1 -0)
python/sglang/srt/layers/attention/flashinfer_mla_backend.py  (+1 -0)
python/sglang/srt/layers/attention/trtllm_mha_backend.py  (+8 -6)
python/sglang/srt/layers/attention/trtllm_mla_backend.py  (+10 -3)
python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json  (+1 -1)
python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json  (+1 -1)
python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json  (+1 -1)
python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json  (+1 -1)
python/pyproject.toml

@@ -63,7 +63,7 @@ srt = [
     "torchaudio==2.8.0",
     "torchvision",
     "cuda-python",
-    "flashinfer_python==0.2.11.post1",
+    "flashinfer_python==0.2.11.post3",
 ]
 blackwell = [
@@ -73,7 +73,7 @@ blackwell = [
     "torchaudio==2.8.0",
     "torchvision",
     "cuda-python",
-    "flashinfer_python==0.2.11.post1",
+    "flashinfer_python==0.2.11.post3",
 ]
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
python/sglang/srt/entrypoints/engine.py

@@ -647,7 +647,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.11.post1",
+            "0.2.11.post3",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
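With the dependency bump, engine.py now requires flashinfer_python >= 0.2.11.post3 whenever the flashinfer attention backend is selected. For readers unfamiliar with this kind of startup guard, here is a minimal, hypothetical sketch of an equivalent check; it is not sglang's assert_pkg_version, and the helper name check_min_version plus the use of importlib.metadata and packaging are assumptions for illustration.

# Illustrative sketch only, not sglang's implementation.
from importlib.metadata import PackageNotFoundError, version

from packaging.version import parse


def check_min_version(pkg: str, min_version: str, hint: str) -> None:
    """Raise if `pkg` is missing or older than `min_version`."""
    try:
        installed = version(pkg)
    except PackageNotFoundError as err:
        raise RuntimeError(f"{pkg} is not installed. {hint}") from err
    if parse(installed) < parse(min_version):
        raise RuntimeError(
            f"{pkg}=={installed} is too old (need >= {min_version}). {hint}"
        )


# Example mirroring the updated call site; prints the error instead of crashing.
try:
    check_min_version(
        "flashinfer_python",
        "0.2.11.post3",
        "Please reinstall following https://docs.flashinfer.ai/installation.html.",
    )
except RuntimeError as err:
    print(err)

The fail-fast behavior mirrors the intent of the assert_pkg_version call above: an old flashinfer_python should stop the server at startup rather than fail later inside a kernel.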
python/sglang/srt/layers/attention/flashinfer_backend.py

@@ -122,6 +122,7 @@ class FlashInferAttnBackend(AttentionBackend):
         # Allocate buffers
         global global_workspace_buffer
         if global_workspace_buffer is None:
+            # different from flashinfer zero_init_global_workspace_buffer
             global_workspace_buffer = torch.empty(
                 global_config.flashinfer_workspace_size,
                 dtype=torch.uint8,
python/sglang/srt/layers/attention/flashinfer_mla_backend.py

@@ -81,6 +81,7 @@ class FlashInferMLAAttnBackend(AttentionBackend):
         # Allocate buffers
         global global_workspace_buffer
         if global_workspace_buffer is None:
+            # different from flashinfer zero_init_global_workspace_buffer
             global_workspace_buffer = torch.empty(
                 global_config.flashinfer_workspace_size,
                 dtype=torch.uint8,
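Both flashinfer backends only gain a comment here: their shared workspace is still created with torch.empty, i.e. deliberately left uninitialized, unlike flashinfer's own zero_init_global_workspace_buffer. A rough sketch of the lazy, module-level allocation pattern follows (simplified; WORKSPACE_SIZE and get_workspace are placeholders, not sglang names, and the real code uses global_config.flashinfer_workspace_size):

# Minimal sketch of the lazy module-level workspace buffer, under the
# assumptions named above.
import torch

WORKSPACE_SIZE = 128 * 1024 * 1024  # bytes; placeholder size
global_workspace_buffer = None


def get_workspace(device: str = "cuda") -> torch.Tensor:
    """Allocate the shared workspace once and reuse it across all wrappers."""
    global global_workspace_buffer
    if global_workspace_buffer is None:
        # torch.empty leaves the memory uninitialized; the added comment in the
        # diff flags exactly this as "different from flashinfer
        # zero_init_global_workspace_buffer".
        global_workspace_buffer = torch.empty(
            WORKSPACE_SIZE, dtype=torch.uint8, device=device
        )
    return global_workspace_buffer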
python/sglang/srt/layers/attention/trtllm_mha_backend.py

@@ -23,10 +23,12 @@ if TYPE_CHECKING:
     from sglang.srt.speculative.spec_info import SpecInfo

 # Constants
-DEFAULT_WORKSPACE_SIZE_MB = 128  # Memory workspace size in MB
+DEFAULT_WORKSPACE_SIZE_MB = (
+    512  # Memory workspace size in MB, todo(Yingyi): read from config
+)

 # Reuse this workspace buffer across all TRTLLM MHA wrappers
-global_workspace_buffer = None
+global_zero_init_workspace_buffer = None


 @dataclass
@@ -73,14 +75,14 @@ class TRTLLMHAAttnBackend(FlashInferAttnBackend):
         # Workspace allocation
         self.workspace_size = DEFAULT_WORKSPACE_SIZE_MB * 1024 * 1024
         # Allocate buffers
-        global global_workspace_buffer
-        if global_workspace_buffer is None:
-            global_workspace_buffer = torch.empty(
+        global global_zero_init_workspace_buffer
+        if global_zero_init_workspace_buffer is None:
+            global_zero_init_workspace_buffer = torch.zeros(
                 self.workspace_size,
                 dtype=torch.uint8,
                 device=model_runner.device,
             )
-        self.workspace_buffer = global_workspace_buffer
+        self.workspace_buffer = global_zero_init_workspace_buffer

         # CUDA graph state
         self.decode_cuda_graph_metadata = {}
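The substantive change lands in the TRTLLM MHA backend: the workspace grows from 128 MB to 512 MB and is now allocated once with torch.zeros under a dedicated global, so every wrapper shares a buffer whose contents start at zero. A condensed sketch of that pattern is below (not the full backend class; the helper function get_zero_init_workspace is invented here for illustration):

# Sketch of the zero-initialized, globally shared workspace, under the
# assumptions named above.
import torch

DEFAULT_WORKSPACE_SIZE_MB = 512  # bumped from 128 in this commit
global_zero_init_workspace_buffer = None


def get_zero_init_workspace(device: str = "cuda") -> torch.Tensor:
    """Lazily allocate a zero-filled workspace shared by all TRTLLM wrappers."""
    global global_zero_init_workspace_buffer
    if global_zero_init_workspace_buffer is None:
        global_zero_init_workspace_buffer = torch.zeros(
            DEFAULT_WORKSPACE_SIZE_MB * 1024 * 1024,
            dtype=torch.uint8,
            device=device,
        )
    return global_zero_init_workspace_buffer

Allocating once and reusing the buffer keeps the larger 512 MB footprint to a single allocation per device instead of one per backend instance.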
python/sglang/srt/layers/attention/trtllm_mla_backend.py

@@ -39,6 +39,8 @@ DEFAULT_WORKSPACE_SIZE_MB = 128  # Memory workspace size in MB
 # compute the LCM with other padding constraints.
 TRTLLM_BLOCK_CONSTRAINT = 128

+global_zero_init_workspace_buffer = None
+

 @dataclass
 class TRTLLMMLADecodeMetadata:
@@ -83,9 +85,14 @@ class TRTLLMMLABackend(FlashInferMLAAttnBackend):
         # Workspace allocation
         self.workspace_size = DEFAULT_WORKSPACE_SIZE_MB * 1024 * 1024
-        self.workspace_buffer = torch.empty(
-            self.workspace_size, dtype=torch.int8, device=self.device
-        )
+        global global_zero_init_workspace_buffer
+        if global_zero_init_workspace_buffer is None:
+            global_zero_init_workspace_buffer = torch.zeros(
+                self.workspace_size,
+                dtype=torch.uint8,
+                device=model_runner.device,
+            )
+        self.workspace_buffer = global_zero_init_workspace_buffer

         # CUDA graph state
         self.decode_cuda_graph_metadata = {}
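The MLA backend gets the same treatment: its per-instance torch.empty(..., dtype=torch.int8) buffer is replaced by the shared zero-initialized uint8 buffer. The practical difference between the two allocators is easy to demonstrate: torch.empty hands back whatever bytes the allocator has available, so a kernel that reads workspace memory before writing it can observe garbage, while torch.zeros guarantees a deterministic starting state. A CPU-only illustration:

# Illustration of empty vs. zeros initialization (not sglang code).
import torch

uninit = torch.empty(8, dtype=torch.uint8)  # contents are arbitrary
zeroed = torch.zeros(8, dtype=torch.uint8)  # contents are guaranteed to be 0
print(uninit)  # may print any byte values
print(zeroed)  # always prints tensor([0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.uint8)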
python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json

@@ -143,4 +143,4 @@
         "num_warps": 4,
         "num_stages": 3
     }
-}
\ No newline at end of file
+}
python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json

@@ -143,4 +143,4 @@
         "num_warps": 4,
         "num_stages": 4
     }
-}
\ No newline at end of file
+}
python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json

@@ -143,4 +143,4 @@
         "num_warps": 4,
         "num_stages": 3
     }
-}
\ No newline at end of file
+}
python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json

@@ -143,4 +143,4 @@
         "num_warps": 4,
         "num_stages": 3
     }
-}
\ No newline at end of file
+}
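The four Triton MoE config edits only add the missing trailing newline at end of file; the tuning values themselves are untouched. If regenerating such configs, note that json.dump does not emit a final newline, so the writer has to append one explicitly (illustrative sketch; the file name and content are placeholders):

# Write a JSON config that ends with a newline.
import json

config = {"num_warps": 4, "num_stages": 3}  # placeholder content
with open("example_config.json", "w") as f:
    json.dump(config, f, indent=4)
    f.write("\n")  # ensure the file ends with a newline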