Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9dad5cc8
Unverified
Commit
9dad5cc8
authored
Jul 14, 2024
by
Tyler Michael Smith
Committed by
GitHub
Jul 14, 2024
Browse files
[Kernel] Turn off CUTLASS scaled_mm for Ada Lovelace (#6384)
parent
6ef3bf91
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
15 additions
and
9 deletions
+15
-9
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
...figs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
+4
-4
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
...lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
+3
-3
csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+8
-2
No files found.
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
View file @
9dad5cc8
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 250 -f 5 -t 1
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name
:
"
nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
model_name
:
"
nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks
:
tasks
:
-
name
:
"
gsm8k"
-
name
:
"
gsm8k"
metrics
:
metrics
:
-
name
:
"
exact_match,strict-match"
-
name
:
"
exact_match,strict-match"
value
:
0.752
value
:
0.755
-
name
:
"
exact_match,flexible-extract"
-
name
:
"
exact_match,flexible-extract"
value
:
0.752
value
:
0.755
limit
:
250
limit
:
1000
num_fewshot
:
5
num_fewshot
:
5
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
View file @
9dad5cc8
...
@@ -4,8 +4,8 @@ tasks:
...
@@ -4,8 +4,8 @@ tasks:
-
name
:
"
gsm8k"
-
name
:
"
gsm8k"
metrics
:
metrics
:
-
name
:
"
exact_match,strict-match"
-
name
:
"
exact_match,strict-match"
value
:
0.756
value
:
0.753
-
name
:
"
exact_match,flexible-extract"
-
name
:
"
exact_match,flexible-extract"
value
:
0.752
value
:
0.753
limit
:
250
limit
:
1000
num_fewshot
:
5
num_fewshot
:
5
csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
View file @
9dad5cc8
...
@@ -38,7 +38,13 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
...
@@ -38,7 +38,13 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
if
(
cuda_device_capability
>=
90
)
{
if
(
cuda_device_capability
>=
90
)
{
return
CUDA_VERSION
>=
12000
;
return
CUDA_VERSION
>=
12000
;
}
else
if
(
cuda_device_capability
>=
89
)
{
}
else
if
(
cuda_device_capability
>=
89
)
{
return
CUDA_VERSION
>=
12040
;
// CUTLASS Kernels have not been tuned for Ada Lovelace systems
// and are slower than torch.mm. Return false unconditionally in this case.
return
false
;
// Once the CUTLASS kernels have been optimized for Lovelace systems,
// use the following check:
// return CUDA_VERSION >= 12040;
}
}
#endif
#endif
...
@@ -98,4 +104,4 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
...
@@ -98,4 +104,4 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
TORCH_CHECK
(
version_num
>=
75
);
TORCH_CHECK
(
version_num
>=
75
);
cutlass_scaled_mm_sm75
(
c
,
a
,
b
,
a_scales
,
b_scales
,
bias
);
cutlass_scaled_mm_sm75
(
c
,
a
,
b
,
a_scales
,
b_scales
,
bias
);
}
}
}
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment