change / sglang · Commits · e9c6ce46

Commit e9c6ce46 (Unverified)
Authored Apr 02, 2025 by Xiaoyu Zhang; committed by GitHub, Apr 02, 2025
Parent: 3fadc647

sgl scaled_fp8_quant support output padding (#4861)

Showing 3 changed files with 61 additions and 4 deletions:
- python/sglang/srt/custom_op.py (+5, -0)
- python/sglang/srt/layers/quantization/fp8_utils.py (+1, -4)
- python/sglang/test/test_custom_ops.py (+55, -0)
python/sglang/srt/custom_op.py

```diff
@@ -50,6 +50,7 @@ if _is_cuda:
     def scaled_fp8_quant(
         input: torch.Tensor,
         scale: Optional[torch.Tensor] = None,
+        num_token_padding: Optional[int] = None,
         use_per_token_if_dynamic: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
@@ -59,6 +60,8 @@ if _is_cuda:
             input (torch.Tensor): Input tensor to be quantized
             scale (Optional[torch.Tensor]): Pre-computed scaling factor for static quantization.
                 If None, scales will be computed dynamically.
+            num_token_padding (Optional[int]): If specified, pad the first dimension
+                of the output to at least this value.
             use_per_token_if_dynamic (bool): When using dynamic scaling (scale=None),
                 determines the quantization granularity:
                 - True: compute scale per token
@@ -75,6 +78,8 @@ if _is_cuda:
         assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
         shape = input.shape
         out_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+        if num_token_padding:
+            shape = (max(num_token_padding, input.shape[0]), shape[1])
         output = torch.empty(shape, device=input.device, dtype=out_dtype)
         if scale is None:
```
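The effect of the new `num_token_padding` argument can be illustrated in isolation. Below is a minimal sketch (not the sglang kernel itself) of the shape computation the patch adds: the output is allocated with at least `num_token_padding` rows, and never fewer rows than the input has. `padded_output_shape` is a hypothetical helper name used only for this illustration.

```python
from typing import Optional, Tuple

import torch


def padded_output_shape(
    input: torch.Tensor, num_token_padding: Optional[int]
) -> Tuple[int, int]:
    # Mirrors the added logic: grow the first dimension to num_token_padding,
    # but never truncate below the actual number of input rows.
    shape = input.shape
    if num_token_padding:
        shape = (max(num_token_padding, input.shape[0]), shape[1])
    return tuple(shape)


x = torch.randn(5, 16)
assert padded_output_shape(x, 10) == (10, 16)   # padded up to 10 rows
assert padded_output_shape(x, 3) == (5, 16)     # input already larger; unchanged
assert padded_output_shape(x, None) == (5, 16)  # no padding requested
```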
python/sglang/srt/layers/quantization/fp8_utils.py

```diff
@@ -457,12 +457,9 @@ class Fp8LinearOp:
             qinput, x_scale = sgl_scaled_fp8_quant(
                 input_2d,
                 input_scale,
+                num_token_padding=self.output_padding,
                 use_per_token_if_dynamic=use_per_token_if_dynamic,
             )
-            if self.output_padding:
-                pad_size = max(self.output_padding - qinput.shape[0], 0)
-                if pad_size > 0:
-                    qinput = torch.nn.functional.pad(qinput, (0, 0, 0, pad_size))
         else:
             qinput, x_scale = ops.scaled_fp8_quant(
                 input_2d,
```
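For context, the four removed lines padded the quantized output after the fact with `torch.nn.functional.pad`; the patch instead hands the target size to `scaled_fp8_quant` via `num_token_padding=self.output_padding`, presumably avoiding the extra allocation and copy. A standalone sketch of what the removed caller-side code did, with an ordinary float tensor standing in for the quantized output:

```python
import torch
import torch.nn.functional as F

original = torch.randn(5, 16)  # stand-in for the quantized 2D output
output_padding = 10            # stand-in for self.output_padding

# The removed logic: grow the first dimension to output_padding by appending rows.
# F.pad's (0, 0, 0, pad_size) means: last dim padded by (0, 0),
# first dim padded by (0, pad_size), i.e. pad_size rows appended at the bottom.
pad_size = max(output_padding - original.shape[0], 0)
padded = F.pad(original, (0, 0, 0, pad_size)) if pad_size > 0 else original

assert padded.shape == (10, 16)
assert torch.equal(padded[:5], original)  # real rows are untouched
```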
python/sglang/test/test_custom_ops.py

```diff
@@ -82,6 +82,61 @@ if is_cuda:
             dequantize_per_token(ref_y, scale, dtype),
         )
 
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+    def test_scaled_fp8_quant_with_padding(dtype) -> None:
+        original_rows = 5
+        x = (torch.randn(size=(original_rows, 16), device="cuda") * 13).to(dtype)
+
+        padding_size = 10
+
+        # Test with dynamic quantization
+        y_dynamic, scale_dynamic = scaled_fp8_quant(
+            x, None, num_token_padding=padding_size
+        )
+
+        # Verify output shape has the padded size
+        assert y_dynamic.shape[0] == padding_size
+        assert y_dynamic.shape[1] == x.shape[1]
+
+        # Verify that the actual data in the non-padded region is correctly quantized
+        y_without_padding, scale_without_padding = scaled_fp8_quant(x, None)
+        torch.testing.assert_close(y_dynamic[:original_rows], y_without_padding)
+
+        # Test with static quantization
+        # First get a scale
+        _, scale = scaled_fp8_quant(x, None)
+
+        # Then use it for static quantization with padding
+        y_static, _ = scaled_fp8_quant(x, scale, num_token_padding=padding_size)
+
+        # Verify output shape has the padded size
+        assert y_static.shape[0] == padding_size
+        assert y_static.shape[1] == x.shape[1]
+
+        # Verify that the actual data in the non-padded region is correctly quantized
+        y_static_without_padding, _ = scaled_fp8_quant(x, scale)
+        torch.testing.assert_close(y_static[:original_rows], y_static_without_padding)
+
+        # Test with per-token dynamic quantization
+        y_per_token, scale_per_token = scaled_fp8_quant(
+            x, None, num_token_padding=padding_size, use_per_token_if_dynamic=True
+        )
+
+        # Verify output shape has the padded size
+        assert y_per_token.shape[0] == padding_size
+        assert y_per_token.shape[1] == x.shape[1]
+
+        # Verify that the actual data in the non-padded region is correctly quantized
+        y_per_token_without_padding, scale_per_token_without_padding = scaled_fp8_quant(
+            x, None, use_per_token_if_dynamic=True
+        )
+        torch.testing.assert_close(
+            y_per_token[:original_rows], y_per_token_without_padding
+        )
+        torch.testing.assert_close(
+            scale_per_token[:original_rows], scale_per_token_without_padding
+        )
+
 
 if __name__ == "__main__":
     # Run the specific test function directly
```
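A hedged usage sketch for invoking the new test outside of pytest, along the lines of the file's `__main__` hint. It assumes a CUDA device is available and that the module imported successfully (the test is only defined under the `is_cuda` guard, so the import below is an assumption about that environment):

```python
import torch
from sglang.test.test_custom_ops import test_scaled_fp8_quant_with_padding

if torch.cuda.is_available():
    # Exercise both parametrized dtypes, matching the pytest parametrization.
    for dtype in (torch.float16, torch.bfloat16):
        test_scaled_fp8_quant_with_padding(dtype)
    print("test_scaled_fp8_quant_with_padding: OK")
```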