Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
76abd0c8
Unverified
Commit
76abd0c8
authored
Feb 05, 2025
by
Lucas Wilkinson
Committed by
GitHub
Feb 05, 2025
Browse files
[Bugfix] Better FP8 supported defaults
parent
5b19b930
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
22 additions
and
12 deletions
+22
-12
vllm/model_executor/layers/quantization/utils/fp8_utils.py
vllm/model_executor/layers/quantization/utils/fp8_utils.py
+17
-11
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+5
-1
No files found.
vllm/model_executor/layers/quantization/utils/fp8_utils.py
View file @
76abd0c8
...
@@ -15,7 +15,7 @@ from vllm.logger import init_logger
...
@@ -15,7 +15,7 @@ from vllm.logger import init_logger
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
_normalize_quant_group_shape
,
scaled_dequantize
)
_normalize_quant_group_shape
,
scaled_dequantize
)
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
apply_fp8_linear
)
CUTLASS_BLOCK_FP8_SUPPORTED
,
CUTLASS_FP8_SUPPORTED
,
apply_fp8_linear
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -38,7 +38,7 @@ def apply_w8a8_block_fp8_linear(
...
@@ -38,7 +38,7 @@ def apply_w8a8_block_fp8_linear(
weight_scale
:
torch
.
Tensor
,
weight_scale
:
torch
.
Tensor
,
input_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
input_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
,
cutlass_block_fp8_supported
:
bool
=
True
,
cutlass_block_fp8_supported
:
bool
=
CUTLASS_BLOCK_FP8_SUPPORTED
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
assert
input_scale
is
None
assert
input_scale
is
None
# View input as 2D matrix for fp8 methods
# View input as 2D matrix for fp8 methods
...
@@ -91,6 +91,8 @@ def apply_fp8_linear_generic(
...
@@ -91,6 +91,8 @@ def apply_fp8_linear_generic(
input_group_shape
:
Tuple
[
int
,
int
],
input_group_shape
:
Tuple
[
int
,
int
],
weight_group_shape
:
Tuple
[
int
,
int
],
weight_group_shape
:
Tuple
[
int
,
int
],
input_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
# static scale if one
input_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
# static scale if one
cutlass_fp8_supported
:
bool
=
CUTLASS_FP8_SUPPORTED
,
cutlass_block_fp8_supported
:
bool
=
CUTLASS_BLOCK_FP8_SUPPORTED
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
# View input as 2D matrix for fp8 methods
# View input as 2D matrix for fp8 methods
input
=
input
.
view
(
-
1
,
input
.
shape
[
-
1
])
input
=
input
.
view
(
-
1
,
input
.
shape
[
-
1
])
...
@@ -105,14 +107,18 @@ def apply_fp8_linear_generic(
...
@@ -105,14 +107,18 @@ def apply_fp8_linear_generic(
if
is_dim_blocked
(
0
,
weight
.
shape
,
weight_group_shape
[
0
])
\
if
is_dim_blocked
(
0
,
weight
.
shape
,
weight_group_shape
[
0
])
\
and
is_dim_blocked
(
1
,
weight
.
shape
,
weight_group_shape
[
1
])
and
\
and
is_dim_blocked
(
1
,
weight
.
shape
,
weight_group_shape
[
1
])
and
\
input_group_shape
==
(
1
,
weight_group_shape
[
1
]):
input_group_shape
==
(
1
,
weight_group_shape
[
1
]):
return
apply_w8a8_block_fp8_linear
(
input
,
weight
,
return
apply_w8a8_block_fp8_linear
(
input
,
weight
,
list
(
weight_group_shape
),
list
(
weight_group_shape
),
weight_scale
)
weight_scale
,
cutlass_block_fp8_supported
=
cutlass_block_fp8_supported
)
else
:
else
:
# Despite having linear in the name it doesn't conform to
# Despite having linear in the name it doesn't conform to
# `torch.nn.functional.linear` which is defined as `input @ weight.T`
# `torch.nn.functional.linear` which is defined as `input @ weight.T`
# so we explicitly transpose the weight matrix here
# so we explicitly transpose the weight matrix here
return
apply_fp8_linear
(
input
,
weight
.
T
,
weight_scale
.
T
,
return
apply_fp8_linear
(
input
,
weight
.
T
,
weight_scale
.
T
,
cutlass_fp8_supported
=
cutlass_fp8_supported
,
use_per_token_if_dynamic
=
\
use_per_token_if_dynamic
=
\
(
input_group_shape
==
(
1
,
input
.
shape
[
1
])))
(
input_group_shape
==
(
1
,
input
.
shape
[
1
])))
...
...
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
View file @
76abd0c8
...
@@ -42,6 +42,10 @@ def cutlass_block_fp8_supported() -> bool:
...
@@ -42,6 +42,10 @@ def cutlass_block_fp8_supported() -> bool:
return
ops
.
cutlass_scaled_mm_supports_block_fp8
(
capability
)
return
ops
.
cutlass_scaled_mm_supports_block_fp8
(
capability
)
CUTLASS_FP8_SUPPORTED
=
cutlass_fp8_supported
()
CUTLASS_BLOCK_FP8_SUPPORTED
=
cutlass_block_fp8_supported
()
def
per_tensor_dequantize
(
def
per_tensor_dequantize
(
tensor
:
torch
.
Tensor
,
inv_scale
:
Union
[
float
,
tensor
:
torch
.
Tensor
,
inv_scale
:
Union
[
float
,
torch
.
Tensor
])
->
torch
.
Tensor
:
torch
.
Tensor
])
->
torch
.
Tensor
:
...
@@ -109,7 +113,7 @@ def apply_fp8_linear(
...
@@ -109,7 +113,7 @@ def apply_fp8_linear(
input_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
input_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
input_scale_ub
:
Optional
[
torch
.
Tensor
]
=
None
,
input_scale_ub
:
Optional
[
torch
.
Tensor
]
=
None
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
,
cutlass_fp8_supported
:
bool
=
True
,
cutlass_fp8_supported
:
bool
=
CUTLASS_FP8_SUPPORTED
,
use_per_token_if_dynamic
:
bool
=
False
,
use_per_token_if_dynamic
:
bool
=
False
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
# ops.scaled_fp8_quant supports both dynamic and static quant.
# ops.scaled_fp8_quant supports both dynamic and static quant.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment