Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
63934543
Unverified
Commit
63934543
authored
May 25, 2025
by
Michael Goin
Committed by
GitHub
May 25, 2025
Browse files
Speed up the `kernels/quantization/` tests (#18669)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
75f81750
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
17 additions
and
25 deletions
+17
-25
tests/kernels/quantization/test_block_fp8.py
tests/kernels/quantization/test_block_fp8.py
+7
-7
tests/kernels/quantization/test_gguf.py
tests/kernels/quantization/test_gguf.py
+2
-2
tests/kernels/quantization/test_triton_scaled_mm.py
tests/kernels/quantization/test_triton_scaled_mm.py
+8
-16
No files found.
tests/kernels/quantization/test_block_fp8.py
View file @
63934543
...
@@ -36,16 +36,16 @@ vllm_config.scheduler_config.max_model_len = 8192
...
@@ -36,16 +36,16 @@ vllm_config.scheduler_config.max_model_len = 8192
# Test configurations
# Test configurations
DTYPES
=
[
torch
.
bfloat16
]
# [torch.half, torch.bfloat16, torch.float32]
DTYPES
=
[
torch
.
bfloat16
]
# [torch.half, torch.bfloat16, torch.float32]
NUM_TOKENS
=
[
7
,
83
,
2048
]
NUM_TOKENS
=
[
7
,
2050
]
D
=
[
512
,
4096
,
5120
,
13824
]
D
=
[
512
,
4096
,
5120
,
13824
]
GROUP_SIZE
=
[
64
,
128
,
256
,
512
]
GROUP_SIZE
=
[
64
,
128
,
512
]
M
=
[
1
,
7
,
8
,
83
,
84
,
512
,
2048
,
4096
]
M
=
[
1
,
7
,
8
,
83
,
84
,
4096
]
N
=
[
128
,
512
,
1024
,
4096
,
7168
,
7748
,
13824
]
N
=
[
128
,
512
,
7168
,
7748
,
13824
]
K
=
[
256
,
4096
,
5120
,
3884
,
13824
,
16384
]
K
=
[
256
,
3884
,
4096
,
13824
,
16384
]
# Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8
# Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8
# and its hidden size is 7168.
# and its hidden size is 7168.
M_moe
=
[
1
,
2
,
7
,
83
,
128
,
512
,
2048
]
M_moe
=
[
1
,
2
,
7
,
83
,
128
,
2048
]
M_moe_dg
=
[
128
,
192
,
512
,
1335
,
2048
]
M_moe_dg
=
[
128
,
192
,
1335
,
2048
]
N_moe
=
[
128
,
256
,
1024
,
4608
]
# [13824]
N_moe
=
[
128
,
256
,
1024
,
4608
]
# [13824]
K_moe
=
[
256
,
512
,
7168
]
# [13824]
K_moe
=
[
256
,
512
,
7168
]
# [13824]
BLOCK_SIZE
=
[[
128
,
128
]]
BLOCK_SIZE
=
[[
128
,
128
]]
...
...
tests/kernels/quantization/test_gguf.py
View file @
63934543
...
@@ -35,11 +35,11 @@ def get_gguf_MoE_tensors(
...
@@ -35,11 +35,11 @@ def get_gguf_MoE_tensors(
return
GGUFReader
(
sample_file
).
tensors
return
GGUFReader
(
sample_file
).
tensors
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float32
]
DTYPES
=
[
torch
.
bfloat16
]
#
[torch.half, torch.bfloat16, torch.float32]
# Hidden_size for testing, must match the sample file in HF repo,
# Hidden_size for testing, must match the sample file in HF repo,
# we have `hidden_size = 256, 1024` for test in HF repo currently.
# we have `hidden_size = 256, 1024` for test in HF repo currently.
HIDDEN_SIZES
=
[
256
,
1024
]
HIDDEN_SIZES
=
[
256
,
1024
]
NUM_TOKENS
=
[
7
,
83
,
128
,
2048
]
# Arbitrary values for testing
NUM_TOKENS
=
[
7
,
2050
]
# Arbitrary values for testing
SEEDS
=
[
0
]
SEEDS
=
[
0
]
QUANT_TYPES
=
[
QUANT_TYPES
=
[
# i-matrix
# i-matrix
...
...
tests/kernels/quantization/test_triton_scaled_mm.py
View file @
63934543
...
@@ -13,8 +13,13 @@ from vllm.platforms import current_platform
...
@@ -13,8 +13,13 @@ from vllm.platforms import current_platform
device
=
"cuda"
device
=
"cuda"
triton_scaled_mm_module
=
importlib
.
import_module
(
"vllm.model_executor.layers.quantization.compressed_tensors."
"triton_scaled_mm"
)
triton_scaled_mm
=
triton_scaled_mm_module
.
triton_scaled_mm
def
scaled_mm_torch
(
a
:
torch
.
Tensor
,
def
torch_scaled_mm
(
a
:
torch
.
Tensor
,
b
:
torch
.
Tensor
,
b
:
torch
.
Tensor
,
scale_a
:
torch
.
Tensor
,
scale_a
:
torch
.
Tensor
,
scale_b
:
torch
.
Tensor
,
scale_b
:
torch
.
Tensor
,
...
@@ -101,21 +106,8 @@ def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a,
...
@@ -101,21 +106,8 @@ def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a,
if
use_bias
:
if
use_bias
:
bias
=
torch
.
rand
((
N
,
),
device
=
device
,
dtype
=
out_dtype
)
bias
=
torch
.
rand
((
N
,
),
device
=
device
,
dtype
=
out_dtype
)
triton_scaled_mm_module
=
importlib
.
import_module
(
"vllm.model_executor.layers.quantization.compressed_tensors."
"triton_scaled_mm"
)
triton_scaled_mm
=
triton_scaled_mm_module
.
triton_scaled_mm
c_check
=
triton_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
,
bias
)
c_check
=
triton_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
,
bias
)
a_cpu
=
a
.
cpu
()
c_actual
=
torch_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
,
bias
)
b_cpu
=
b
.
cpu
()
scale_a_cpu
=
scale_a
.
cpu
()
scale_b_cpu
=
scale_b
.
cpu
()
bias_cpu
=
None
if
bias
is
None
else
bias
.
cpu
()
c_actual
=
scaled_mm_torch
(
a_cpu
,
b_cpu
,
scale_a_cpu
,
scale_b_cpu
,
out_dtype
,
bias_cpu
)
c_check_cpu
=
c_check
.
cpu
()
torch
.
testing
.
assert_close
(
c_check
,
c_actual
,
rtol
=
1e-1
,
atol
=
1e-1
)
torch
.
testing
.
assert_close
(
c_check_cpu
,
c_actual
,
rtol
=
1e-1
,
atol
=
1e-1
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment