sglang / Commits / 2b1da821

Unverified commit 2b1da821, authored Oct 21, 2025 by Serge Panev; committed by GitHub on Oct 22, 2025.
[NVIDIA] Add new SMs support for Spark & Thor (#11287)

Signed-off-by: Serge Panev <spanev@nvidia.com>

parent 97710ccd
Showing 4 changed files with 22 additions and 8 deletions (+22 −8)
python/sglang/srt/utils/common.py            +9 −1
sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu   +2 −2
sgl-kernel/csrc/gemm/nvfp4_quant.cuh        +10 −4
sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu  +1 −1
python/sglang/srt/utils/common.py

@@ -452,7 +452,15 @@ def get_available_gpu_memory(
         if empty_cache:
             torch.cuda.empty_cache()
-        free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
+        SHARED_SYSMEM_DEVICE_MEM_SMS = (87, 110, 121)  # Orin, Thor, Spark
+        if get_device_sm() in SHARED_SYSMEM_DEVICE_MEM_SMS:
+            # On these devices, which use sysmem as device mem, torch.cuda.mem_get_info()
+            # only reports "free" memory, which can be lower than what is actually
+            # available due to not including cache memory. So we use the system available
+            # memory metric instead.
+            free_gpu_memory = psutil.virtual_memory().available
+        else:
+            free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
     elif device == "xpu":
         num_gpus = torch.xpu.device_count()
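For context, here is a minimal, hypothetical sketch of the selection logic above. The helper name and the use of torch.cuda.get_device_capability in place of the repo's get_device_sm() are assumptions for illustration, not code from this commit:

import psutil
import torch

SHARED_SYSMEM_DEVICE_MEM_SMS = (87, 110, 121)  # Orin, Thor, Spark

def available_gpu_memory_bytes(gpu_id: int = 0) -> int:
    # Hypothetical helper mirroring the hunk above.
    major, minor = torch.cuda.get_device_capability(gpu_id)
    if major * 10 + minor in SHARED_SYSMEM_DEVICE_MEM_SMS:
        # On boards whose "device memory" is shared system RAM,
        # cudaMemGetInfo's "free" figure excludes reclaimable cache,
        # so the OS-level "available" metric is the better estimate.
        return psutil.virtual_memory().available
    free, _total = torch.cuda.mem_get_info(gpu_id)
    return free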
sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu

@@ -568,7 +568,7 @@ void scaled_fp4_experts_quant_sm100a(
     torch::Tensor const& input_offset_by_experts,
     torch::Tensor const& output_scale_offset_by_experts) {
   auto sm_version = getSMVersion();
-  TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a");
+  TORCH_CHECK(sm_version >= 100, "fp4_quant is only supported on sm100+");
   CHECK_INPUT(output, "output must be a CUDA tensor");
   CHECK_INPUT(output_scale, "output_scale must be a CUDA tensor");

@@ -652,7 +652,7 @@ void silu_and_mul_scaled_fp4_experts_quant_sm100a(
     torch::Tensor const& mask,
     bool use_silu_and_mul) {
   auto sm_version = getSMVersion();
-  TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a");
+  TORCH_CHECK(sm_version >= 100, "fp4_quant is only supported on sm100+");
   CHECK_INPUT(output, "output must be a CUDA tensor");
   CHECK_INPUT(output_scale, "output_scale must be a CUDA tensor");
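A minimal sketch (not from the repo) of what the relaxed runtime guard admits; the Orin/Thor/Spark labels come from the tuple added in common.py above:

def old_check(sm: int) -> bool:
    return sm in (100, 103)  # sm100a / sm103a only

def new_check(sm: int) -> bool:
    return sm >= 100         # any sm100+ part

for sm in (87, 90, 100, 103, 110, 121):  # Orin, Hopper, sm100, sm103, Thor, Spark
    print(sm, old_check(sm), new_check(sm))
# Thor (110) and Spark (121) pass only the new check; pre-sm100 parts
# are still rejected by both.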
sgl-kernel/csrc/gemm/nvfp4_quant.cuh

@@ -50,8 +50,9 @@ constexpr int CVT_FP4_SF_VEC_SIZE = 16;
 // Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
 inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
-// PTX instructions used here requires sm100a/sm103a.
-#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED
+// PTX instructions used here requires >= sm100f.
+#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED || \
+    (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ > 1000))
   uint32_t val;
   asm volatile(
       "{\n"

@@ -76,14 +77,17 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
       "f"(array[7]));
   return val;
 #else
+  printf("fp32_vec_to_e2m1 is not supported on this architecture\n");
+  __trap();
   return 0;
 #endif
 }

 // Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
 inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
-// PTX instructions used here requires sm100a/sm103a.
-#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED
+// PTX instructions used here requires >= sm100f.
+#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED || \
+    (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ > 1000))
   uint32_t val;
   asm volatile(
       "{\n"

@@ -108,6 +112,8 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
       "f"(array[3].y));
   return val;
 #else
+  printf("fp32_vec_to_e2m1 is not supported on this architecture\n");
+  __trap();
   return 0;
 #endif
 }
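The widened guard is what enables the new parts here: when nvcc compiles for one of CUDA's family-specific targets (sm_XXf), it defines __CUDA_ARCH_FAMILY_SPECIFIC__, so the PTX conversion path now also builds for Blackwell-family targets beyond sm100a/sm103a. A rough, illustrative Python rendering of the preprocessor condition (the macro values shown are assumptions, e.g. 1100 for an sm_110f Thor build):

def ptx_path_compiled(sm100a: bool, sm103a: bool, family_specific: int | None) -> bool:
    # Mirrors the #if above: the two CUTLASS arch flags, or any
    # family-specific architecture strictly newer than the sm_100 family.
    return sm100a or sm103a or (family_specific is not None and family_specific > 1000)

print(ptx_path_compiled(False, False, 1100))  # True: assumed sm_110f (Thor) build
print(ptx_path_compiled(False, False, None))  # False: takes the #else fallback

The fallback change is also meaningful on its own: the old #else branch silently returned 0 on unsupported architectures, while the new one prints a message and calls __trap(), turning silent zero output into a loud failure.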
sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu

@@ -202,7 +202,7 @@ inline int getMultiProcessorCount() {
 void scaled_fp4_quant_sm100a(
     torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_sf, torch::Tensor const& input_sf) {
   auto sm_version = getSMVersion();
-  TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a");
+  TORCH_CHECK(sm_version >= 100, "fp4_quant is only supported on sm100+");
   int32_t m = input.size(0);
   int32_t n = input.size(1);
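A hypothetical host-side mirror of getSMVersion() to show what the guard compares against; deriving the integer from torch.cuda.get_device_capability is an assumption for illustration, not code from this commit:

import torch

def sm_version(device: int = 0) -> int:
    major, minor = torch.cuda.get_device_capability(device)
    return major * 10 + minor  # e.g. (11, 0) -> 110 on Thor, (12, 1) -> 121 on Spark

# The relaxed guard now admits any sm100+ part:
assert sm_version() >= 100, "fp4_quant is only supported on sm100+"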