Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
4c56e5db
Unverified
Commit
4c56e5db
authored
Mar 21, 2025
by
lukec
Committed by
GitHub
Mar 20, 2025
Browse files
Set deepgemm to the default value in the hopper architecture. (#4613)
parent
7b5fc719
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
16 additions
and
3 deletions
+16
-3
python/sglang/srt/layers/quantization/fp8_kernel.py
python/sglang/srt/layers/quantization/fp8_kernel.py
+9
-3
python/sglang/srt/utils.py
python/sglang/srt/utils.py
+7
-0
No files found.
python/sglang/srt/layers/quantization/fp8_kernel.py
View file @
4c56e5db
...
@@ -26,11 +26,14 @@ from sglang.srt.utils import (
...
@@ -26,11 +26,14 @@ from sglang.srt.utils import (
direct_register_custom_op
,
direct_register_custom_op
,
get_device_core_count
,
get_device_core_count
,
get_device_name
,
get_device_name
,
get_device_sm
,
is_cuda
,
is_cuda
,
is_hip
,
is_hip
,
supports_custom_op
,
supports_custom_op
,
)
)
_enable_jit_deepgemm
=
False
_is_hip
=
is_hip
()
_is_hip
=
is_hip
()
fp8_type_
=
torch
.
float8_e4m3fnuz
if
_is_hip
else
torch
.
float8_e4m3fn
fp8_type_
=
torch
.
float8_e4m3fnuz
if
_is_hip
else
torch
.
float8_e4m3fn
...
@@ -39,9 +42,12 @@ if _is_cuda:
...
@@ -39,9 +42,12 @@ if _is_cuda:
import
deep_gemm
# `pip install "sgl-kernel>=0.0.4.post3"`
import
deep_gemm
# `pip install "sgl-kernel>=0.0.4.post3"`
from
sgl_kernel
import
sgl_per_token_group_quant_fp8
,
sgl_per_token_quant_fp8
from
sgl_kernel
import
sgl_per_token_group_quant_fp8
,
sgl_per_token_quant_fp8
logger
=
logging
.
getLogger
(
__name__
)
sm_version
=
get_device_sm
()
if
sm_version
>=
90
and
int
(
os
.
getenv
(
"SGL_ENABLE_JIT_DEEPGEMM"
,
"1"
)):
_enable_jit_deepgemm
=
True
_enable_jit_deepgemm
=
int
(
os
.
getenv
(
"SGL_ENABLE_JIT_DEEPGEMM"
,
"0"
))
logger
=
logging
.
getLogger
(
__name__
)
if
supports_custom_op
():
if
supports_custom_op
():
...
@@ -771,7 +777,7 @@ def w8a8_block_fp8_matmul(
...
@@ -771,7 +777,7 @@ def w8a8_block_fp8_matmul(
)
)
# deepgemm only support bf16
# deepgemm only support bf16
if
_is_cuda
and
C
.
dtype
==
torch
.
bfloat16
and
_enable_jit_deepgemm
:
if
C
.
dtype
==
torch
.
bfloat16
and
_enable_jit_deepgemm
:
if
supports_custom_op
():
if
supports_custom_op
():
torch
.
ops
.
sglang
.
deep_gemm_fp8_fp8_bf16_nt
(
A
,
As
,
B
,
Bs
,
C
)
torch
.
ops
.
sglang
.
deep_gemm_fp8_fp8_bf16_nt
(
A
,
As
,
B
,
Bs
,
C
)
else
:
else
:
...
...
python/sglang/srt/utils.py
View file @
4c56e5db
...
@@ -1006,6 +1006,13 @@ def get_amdgpu_memory_capacity():
...
@@ -1006,6 +1006,13 @@ def get_amdgpu_memory_capacity():
)
)
def
get_device_sm
():
if
torch
.
cuda
.
is_available
():
major
,
minor
=
torch
.
cuda
.
get_device_capability
()
return
major
*
10
+
minor
return
0
def
get_nvgpu_memory_capacity
():
def
get_nvgpu_memory_capacity
():
try
:
try
:
# Run nvidia-smi and capture the output
# Run nvidia-smi and capture the output
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment