Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a0ac95b0
"wrappers/python/src/vscode:/vscode.git/clone" did not exist on "e3b252043f1a45bad2253e6c755a48efacf38649"
Commit
a0ac95b0
authored
Mar 26, 2026
by
wanghl6
Browse files
per_token_group_quant_fp8 opt
parent
cb68935c
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
37 additions
and
2 deletions
+37
-2
vllm/model_executor/layers/quantization/utils/fp8_utils.py
vllm/model_executor/layers/quantization/utils/fp8_utils.py
+37
-2
No files found.
vllm/model_executor/layers/quantization/utils/fp8_utils.py
View file @
a0ac95b0
...
...
@@ -915,6 +915,37 @@ def _per_token_group_quant_fp8_colmajor(
tl
.
store
(
y_s_ptr
,
y_s
)
def _lightop_per_token_group_quant_fp8_impl(
    x_q: torch.Tensor,
    x: torch.Tensor,
    x_s: torch.Tensor,
    group_size: int,
    eps: float,
    use_ue8m0: bool,
) -> None:
    """Forward a per-token-group FP8 quantization call to ``lightop``.

    Every argument is handed to ``lightop.op.per_token_group_quant_fp8``
    unchanged and in the same positional order. Nothing is returned.

    NOTE(review): presumably the kernel writes its results into ``x_q`` and
    ``x_s`` (these are the arguments declared as mutated when this function
    is registered as a custom op) -- confirm against the lightop docs.
    """
    # Imported at call time so that ``lightop`` is only required when this
    # implementation is actually executed.
    from lightop import op as lightop_ops

    lightop_ops.per_token_group_quant_fp8(
        x_q,
        x,
        x_s,
        group_size,
        eps,
        use_ue8m0,
    )
def _lightop_per_token_group_quant_fp8_fake(
    x_q: torch.Tensor,
    x: torch.Tensor,
    x_s: torch.Tensor,
    group_size: int,
    eps: float,
    use_ue8m0: bool,
) -> None:
    """Fake counterpart of the lightop quantization op: does nothing.

    Accepts the same arguments as the real implementation, leaves all of
    them untouched, and returns ``None``.
    """
    return None
# Register the lightop-backed quantization routine as a custom op named
# "lightop_per_token_group_quant_fp8". ``x_q`` and ``x_s`` are declared as
# the arguments the op mutates, and the no-op fake implementation is supplied
# alongside the real one.
direct_register_custom_op(
    "lightop_per_token_group_quant_fp8",
    _lightop_per_token_group_quant_fp8_impl,
    fake_impl=_lightop_per_token_group_quant_fp8_fake,
    mutates_args=["x_q", "x_s"],
)
def
per_token_group_quant_fp8
(
x
:
torch
.
Tensor
,
group_size
:
int
,
...
...
@@ -981,6 +1012,10 @@ def per_token_group_quant_fp8(
shape
=
x
.
shape
[:
-
1
]
+
(
x
.
shape
[
-
1
]
//
group_size
,)
x_s
=
torch
.
empty
(
shape
,
device
=
x
.
device
,
dtype
=
torch
.
float32
)
if
envs
.
USE_LIGHTOP_PER_TOKEN_GROUP_QUANT_FP8
and
not
column_major_scales
:
torch
.
ops
.
vllm
.
lightop_per_token_group_quant_fp8
(
x_q
,
x
,
x_s
,
group_size
,
eps
,
use_ue8m0
)
return
x_q
,
x_s
# prefer CUDA kernel if available
# TODO(bnell): this causes some fp8 moe test to fail.
if
current_platform
.
is_cuda
()
and
x
.
is_contiguous
():
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment